diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 6fbf0138bb..6ade7b777f 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -58,6 +58,7 @@ "drone-agentic": "dimos.robot.drone.blueprints.agentic.drone_agentic:drone_agentic", "drone-basic": "dimos.robot.drone.blueprints.basic.drone_basic:drone_basic", "dual-xarm6-planner": "dimos.manipulation.blueprints:dual_xarm6_planner", + "greeter-agentic": "dimos.robot.unitree.go2.blueprints.agentic.greeter_agentic:greeter_agentic", "keyboard-teleop-a750": "dimos.robot.manipulators.a750.blueprints:keyboard_teleop_a750", "keyboard-teleop-openarm": "dimos.robot.manipulators.openarm.blueprints:keyboard_teleop_openarm", "keyboard-teleop-openarm-mock": "dimos.robot.manipulators.openarm.blueprints:keyboard_teleop_openarm_mock", diff --git a/dimos/robot/unitree/go2/blueprints/agentic/greeter_agentic.py b/dimos/robot/unitree/go2/blueprints/agentic/greeter_agentic.py new file mode 100644 index 0000000000..a2a244422c --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/greeter_agentic.py @@ -0,0 +1,62 @@ +"""Version-controlled copy of the `greeter-agentic` `dimos run` blueprint. + +The greeter dog: wander (DimOS patrol) until a person sitting on a chair is +found, pick a random not-yet-greeted one, walk over, circle to face them, and +wave hello -- then resume the patrol. The autonomous loop lives in +``GreeterSkillContainer``; start/stop it via MCP. + +Install it next to the stock agentic blueprints in the DimOS source tree and +regenerate the registry: + + cp integration/greeter_agentic.py \\ + "$DIMOS_HOME/dimos/robot/unitree/go2/blueprints/agentic/" + pytest -o addopts="" \\ + "$DIMOS_HOME/dimos/robot/test_all_blueprints_generation.py" + +Then (with the `pawtrack` package importable): + + PYTHONPATH=src PAWTRACK_MODEL=openai:deepseek-chat \ + dimos run greeter-agentic --robot-ip + dimos mcp call start_greeting # or tell the agent: "go greet people" + dimos mcp call stop_greeting # halt + +This is the stock ``unitree-go2-agentic`` general agent **plus** the greeter +skill container -- i.e. a normal Go2 agent that can navigate, speak, run sport +commands, etc., and *also* knows how to greet. It uses the agent's default +general system prompt (no narrow greeter prompt); the agent discovers the +greeting from the skill docstrings (``start_greeting`` / ``stop_greeting`` / +``greeter_status`` / ``wave_hello``) and calls it when asked, like any other +skill. The wander/greet loop itself is autonomous once started. Model defaults +to the agent's default; override with ``PAWTRACK_MODEL``. +""" + +from __future__ import annotations + +import os + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import ( + _common_agentic, +) +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import ( + unitree_go2_spatial, +) + +from pawtrack.greeter_container import GreeterSkillContainer + +_MODEL = os.environ.get("PAWTRACK_MODEL") + +# Stock unitree-go2-agentic (general agent, default prompt) + the greeter skill. +_agent = ( + McpClient.blueprint(model=_MODEL) if _MODEL else McpClient.blueprint() +) + +greeter_agentic = autoconnect( + unitree_go2_spatial, + McpServer.blueprint(), + _agent, + _common_agentic, + GreeterSkillContainer.blueprint(), +) diff --git a/hackathon/pawtrack/README.md b/hackathon/pawtrack/README.md new file mode 100644 index 0000000000..0e7b90c501 --- /dev/null +++ b/hackathon/pawtrack/README.md @@ -0,0 +1,55 @@ +# PawTrack Greeter Hackathon Demo + +## Overview + +PawTrack adds a DimOS / Unitree Go2 autonomous greeter demo. The robot patrols, detects a target such as a person sitting on a chair, tracks it, approaches to a safe standoff, waves, records the visited location, and resumes patrol. The DimOS blueprint is registered as `greeter-agentic`. + +Source lives in `hackathon/pawtrack/src/pawtrack`. The runnable DimOS blueprint is placed at `dimos/robot/unitree/go2/blueprints/agentic/greeter_agentic.py` and registered in `dimos/robot/all_blueprints.py`. + +## Run + +From this DimOS checkout: + +```bash +export DIMOS_HOME="${DIMOS_HOME:-$HOME/sam/dimos}" +export DIMOS_VENV="${DIMOS_VENV:-$HOME/dimos-env}" +export PYTHONPATH="$DIMOS_HOME/hackathon/pawtrack/src:$DIMOS_HOME:${PYTHONPATH:-}" +source "$DIMOS_VENV/bin/activate" +cd "$DIMOS_HOME" +``` + +Simulation: + +```bash +hackathon/pawtrack/scripts/run_greeter_sim.sh +``` + +Then in another shell: + +```bash +dimos mcp call start_greeting --arg target="a person" +dimos mcp call greeter_status +dimos mcp call stop_greeting +``` + +Real Go2: + +```bash +dimos run greeter-agentic --robot-ip +dimos mcp call start_greeting --arg target="a person sitting on a chair" +dimos mcp call stop_greeting +``` + +## Test + +```bash +export DIMOS_HOME="${DIMOS_HOME:-$HOME/sam/dimos}" +export DIMOS_VENV="${DIMOS_VENV:-$HOME/dimos-env}" +source "$DIMOS_VENV/bin/activate" +cd "$DIMOS_HOME" +PYTHONPATH="$DIMOS_HOME/hackathon/pawtrack/src:$DIMOS_HOME" pytest -o addopts= \ + hackathon/pawtrack/tests/test_identify.py \ + hackathon/pawtrack/tests/test_greeter_state.py \ + hackathon/pawtrack/tests/test_greeter_container.py \ + hackathon/pawtrack/tests/test_occupancy.py +``` diff --git a/hackathon/pawtrack/integration/greeter_agentic.py b/hackathon/pawtrack/integration/greeter_agentic.py new file mode 100644 index 0000000000..a2a244422c --- /dev/null +++ b/hackathon/pawtrack/integration/greeter_agentic.py @@ -0,0 +1,62 @@ +"""Version-controlled copy of the `greeter-agentic` `dimos run` blueprint. + +The greeter dog: wander (DimOS patrol) until a person sitting on a chair is +found, pick a random not-yet-greeted one, walk over, circle to face them, and +wave hello -- then resume the patrol. The autonomous loop lives in +``GreeterSkillContainer``; start/stop it via MCP. + +Install it next to the stock agentic blueprints in the DimOS source tree and +regenerate the registry: + + cp integration/greeter_agentic.py \\ + "$DIMOS_HOME/dimos/robot/unitree/go2/blueprints/agentic/" + pytest -o addopts="" \\ + "$DIMOS_HOME/dimos/robot/test_all_blueprints_generation.py" + +Then (with the `pawtrack` package importable): + + PYTHONPATH=src PAWTRACK_MODEL=openai:deepseek-chat \ + dimos run greeter-agentic --robot-ip + dimos mcp call start_greeting # or tell the agent: "go greet people" + dimos mcp call stop_greeting # halt + +This is the stock ``unitree-go2-agentic`` general agent **plus** the greeter +skill container -- i.e. a normal Go2 agent that can navigate, speak, run sport +commands, etc., and *also* knows how to greet. It uses the agent's default +general system prompt (no narrow greeter prompt); the agent discovers the +greeting from the skill docstrings (``start_greeting`` / ``stop_greeting`` / +``greeter_status`` / ``wave_hello``) and calls it when asked, like any other +skill. The wander/greet loop itself is autonomous once started. Model defaults +to the agent's default; override with ``PAWTRACK_MODEL``. +""" + +from __future__ import annotations + +import os + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import ( + _common_agentic, +) +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import ( + unitree_go2_spatial, +) + +from pawtrack.greeter_container import GreeterSkillContainer + +_MODEL = os.environ.get("PAWTRACK_MODEL") + +# Stock unitree-go2-agentic (general agent, default prompt) + the greeter skill. +_agent = ( + McpClient.blueprint(model=_MODEL) if _MODEL else McpClient.blueprint() +) + +greeter_agentic = autoconnect( + unitree_go2_spatial, + McpServer.blueprint(), + _agent, + _common_agentic, + GreeterSkillContainer.blueprint(), +) diff --git a/hackathon/pawtrack/integration/pawtrack_agentic.py b/hackathon/pawtrack/integration/pawtrack_agentic.py new file mode 100644 index 0000000000..2ff9ee408d --- /dev/null +++ b/hackathon/pawtrack/integration/pawtrack_agentic.py @@ -0,0 +1,46 @@ +"""Version-controlled copy of the PawTrack `dimos run` blueprint. + +`dimos run ` resolves names from a registry that the generator builds by +scanning only the dimos source tree, so the blueprint object has to live inside +that tree. This file is the canonical copy; install it with: + + cp integration/pawtrack_agentic.py \\ + "$DIMOS_HOME/dimos/robot/unitree/go2/blueprints/agentic/" + pytest "$DIMOS_HOME/dimos/robot/test_all_blueprints_generation.py" # regen + +Then (with the `pawtrack` package importable): + + PYTHONPATH=src dimos run pawtrack-agentic --robot-ip + +It rebuilds the four parts of the stock ``unitree-go2-agentic`` blueprint but +swaps in an ``McpClient`` with the PawTrack prompt and adds the PawTrack +subject-tracking skill container -- reusing the stock stack, no duplicate +agent. +""" + +from __future__ import annotations + +import os + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import ( + _common_agentic, +) +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import ( + unitree_go2_spatial, +) + +from pawtrack.skill_container import PawTrackSkillContainer +from pawtrack.system_prompt import PAWTRACK_PROMPT + +_MODEL = os.environ.get("PAWTRACK_MODEL", "openai:gpt-5.1") + +pawtrack_agentic = autoconnect( + unitree_go2_spatial, + McpServer.blueprint(), + McpClient.blueprint(model=_MODEL, system_prompt=PAWTRACK_PROMPT), + _common_agentic, + PawTrackSkillContainer.blueprint(), +) diff --git a/hackathon/pawtrack/integration/pawtrack_agentic_relocalization.py b/hackathon/pawtrack/integration/pawtrack_agentic_relocalization.py new file mode 100644 index 0000000000..f3e19e48bc --- /dev/null +++ b/hackathon/pawtrack/integration/pawtrack_agentic_relocalization.py @@ -0,0 +1,50 @@ +"""PawTrack agentic blueprint with optional prebuilt-map relocalization. + +Install this next to ``pawtrack_agentic.py`` in the DimOS source tree, then +regenerate ``all_blueprints``: + + cp integration/pawtrack_agentic_relocalization.py \ + "$DIMOS_HOME/dimos/robot/unitree/go2/blueprints/agentic/" + pytest -o addopts="" \ + "$DIMOS_HOME/dimos/robot/test_all_blueprints_generation.py" + +Run with a prebuilt map: + + PYTHONPATH=src dimos run pawtrack-agentic-relocalization \ + --robot-ip \ + -o relocalizationmodule.map_file= + +This intentionally keeps the existing ``pawtrack-agentic`` blueprint +unchanged. +Use this blueprint only when you want ``subject_map_pose`` in addition to the +always-on ``subject_world_pose``. +""" + +from __future__ import annotations + +import os + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.mapping.relocalization.module import RelocalizationModule +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import ( + _common_agentic, +) +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import ( + unitree_go2_spatial, +) + +from pawtrack.skill_container import PawTrackSkillContainer +from pawtrack.system_prompt import PAWTRACK_PROMPT + +_MODEL = os.environ.get("PAWTRACK_MODEL", "openai:gpt-5.1") + +pawtrack_agentic_relocalization = autoconnect( + unitree_go2_spatial, + RelocalizationModule.blueprint(), + McpServer.blueprint(), + McpClient.blueprint(model=_MODEL, system_prompt=PAWTRACK_PROMPT), + _common_agentic, + PawTrackSkillContainer.blueprint(), +) diff --git a/hackathon/pawtrack/pyproject.toml b/hackathon/pawtrack/pyproject.toml new file mode 100644 index 0000000000..171bf8af8d --- /dev/null +++ b/hackathon/pawtrack/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools>=70", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "pawtrack" +version = "0.1.0" +description = "PawTrack: a Unitree Go2 that finds and tracks a described subject." +requires-python = ">=3.10" +dependencies = [] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +pythonpath = ["src"] +addopts = "-q" + diff --git a/hackathon/pawtrack/scripts/go2_exploration.py b/hackathon/pawtrack/scripts/go2_exploration.py new file mode 100644 index 0000000000..cd5b54da59 --- /dev/null +++ b/hackathon/pawtrack/scripts/go2_exploration.py @@ -0,0 +1,95 @@ +"""Send DimOS Go2 frontier-exploration LCM commands. + +Run this while a `unitree-go2` or `unitree-go2-memory` stack is already +running. The stack's `WavefrontFrontierExplorer` subscribes to these Boolean +command topics: + +- `/explore_cmd`: start autonomous frontier exploration. +- `/stop_explore_cmd`: stop exploration and publish the current pose as a goal. + +Examples: + python scripts/go2_exploration.py start + python scripts/go2_exploration.py stop +""" + +from __future__ import annotations + +import argparse +import time +from typing import Literal + +from dimos.core.transport import LCMTransport +from dimos_lcm.std_msgs import Bool + + +Action = Literal["start", "stop"] + + +_TOPICS: dict[Action, str] = { + "start": "/explore_cmd", + "stop": "/stop_explore_cmd", +} + + +def send_exploration_command( + action: Action, + repeats: int, + interval_s: float, +) -> None: + """Publish a frontier-exploration command over LCM. + + Args: + action: `start` begins exploration; `stop` ends it. + repeats: Number of times to publish the command. + interval_s: Delay between repeated publishes. + + Raises: + ValueError: If repeat or interval settings are invalid. + """ + if repeats < 1: + raise ValueError("repeats must be at least 1") + if interval_s < 0.0: + raise ValueError("interval_s must be non-negative") + + topic = _TOPICS[action] + transport = LCMTransport(topic, Bool) + message = Bool(data=True) + for index in range(repeats): + transport.broadcast(None, message) + if index + 1 < repeats and interval_s > 0.0: + time.sleep(interval_s) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Start or stop DimOS Go2 autonomous frontier exploration.", + ) + parser.add_argument( + "action", + choices=sorted(_TOPICS), + help="Exploration command to send.", + ) + parser.add_argument( + "--repeats", + type=int, + default=3, + help="Publish count. Repeating makes startup timing less brittle.", + ) + parser.add_argument( + "--interval-s", + type=float, + default=0.2, + help="Seconds between repeated publishes.", + ) + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + send_exploration_command(args.action, args.repeats, args.interval_s) + verb = "Started" if args.action == "start" else "Stopped" + print(f"{verb} autonomous frontier exploration.") + + +if __name__ == "__main__": + main() diff --git a/hackathon/pawtrack/scripts/run_greeter_sim.sh b/hackathon/pawtrack/scripts/run_greeter_sim.sh new file mode 100755 index 0000000000..aa49f2f2fc --- /dev/null +++ b/hackathon/pawtrack/scripts/run_greeter_sim.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# +# Run the greeter end-to-end in the MuJoCo office sim. +# +# Encodes the sim gotchas we hit the hard way: +# - MuJoCo odom/TF/sensors are gated on a display, and MUST NOT use EGL. +# - the Clash proxy breaks DimOS signaling + the MCP httpx client. +# - the perception models are already cached, so run HuggingFace offline +# (a live revalidation times out behind the CN network/proxy). +# +# The office scene provides chairs and a person mesh (the old red ball was +# removed). Plant the person and start the loop from a second shell: +# +# PYTHONPATH=src python scripts/sim_person_pose.py --x 2.5 --y 0.0 +# dimos mcp call start_greeting # begin wandering + greeting +# dimos mcp call greeter_status # current phase +# dimos mcp call stop_greeting +# +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +export DIMOS_HOME="${DIMOS_HOME:-$HOME/sam/dimos}" +export DIMOS_VENV="${DIMOS_VENV:-$HOME/dimos-env}" +export PYTHONPATH="$REPO_ROOT/src:$DIMOS_HOME:${PYTHONPATH:-}" + +# Sim sensors/odom need a display; do not force EGL (it stalls the sensor loop). +export DISPLAY="${DISPLAY:-:1}" +unset MUJOCO_GL || true + +# The proxy breaks DimOS networking; perception models are cached, so go offline. +unset HTTP_PROXY HTTPS_PROXY http_proxy https_proxy all_proxy ALL_PROXY || true +export HF_HUB_OFFLINE=1 TRANSFORMERS_OFFLINE=1 + +# The autonomous loop does not use the agent LLM, but the stack expects a model. +export PAWTRACK_MODEL="${PAWTRACK_MODEL:-openai:deepseek-chat}" + +# The sim person mesh stands rather than sits, so target "a person" in sim. +# Override with GREETER_SUBJECT if your scene differs. +GREETER_SUBJECT="${GREETER_SUBJECT:-a person}" + +# shellcheck disable=SC1091 +source "$DIMOS_VENV/bin/activate" +cd "$DIMOS_HOME" + +exec dimos --simulation run greeter-agentic \ + -o "greeterskillcontainer.description=${GREETER_SUBJECT}" diff --git a/hackathon/pawtrack/scripts/run_pawtrack.py b/hackathon/pawtrack/scripts/run_pawtrack.py new file mode 100644 index 0000000000..c70f6fee49 --- /dev/null +++ b/hackathon/pawtrack/scripts/run_pawtrack.py @@ -0,0 +1,116 @@ +"""Launch the PawTrack subject-tracking agentic blueprint. + +PawTrack is not registered in DimOS ``all_blueprints``, so it ships its own +launcher. Modes: + +- default (off-robot): ``McpServer`` + ``McpClient`` + the subject-tracking + skill container. Enough to inspect tool discovery and call tools; no frames. +- ``--camera``: also run a webcam ``CameraModule`` and the Rerun viewer, so the + whole VLM-acquire + EdgeTAM-track pipeline runs end to end with no robot -- + point a laptop camera at the subject and describe it. +- ``--source PATH``: same end-to-end test but fed from an image or video file + instead of a webcam -- fully hardware-free (good for CI / a teammate). +- ``--robot``: the full ``unitree_go2_agentic`` stack + the tracker; the robot + supplies the camera frames and viewer. + +Describe the subject from the DimOS CLI (no robot needed): + dimos mcp call track_subject --arg description="a person sitting on a chair" + dimos agent-send "track the person sitting on the chair" # via the agent + +Run (in the venv), e.g. (the first two need no robot): + PYTHONPATH=src python scripts/run_pawtrack.py --source room.mp4 + PYTHONPATH=src python scripts/run_pawtrack.py --camera + PYTHONPATH=src python scripts/run_pawtrack.py --robot + +Diagnostics: ``subject_status`` is a JSON LCM stream; ``debug_image`` is the +annotated camera frame shown in Rerun (green box when tracking, red while +searching). + +Model defaults to ``openai:gpt-5.1`` (needs ``OPENAI_API_KEY``); override with +``PAWTRACK_MODEL`` (e.g. ``openai:deepseek-chat`` with ``OPENAI_BASE_URL``). +""" + +from __future__ import annotations + +import argparse +import importlib +import os + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.core.coordination.blueprints import autoconnect +from dimos.core.coordination.module_coordinator import ModuleCoordinator +from dimos.core.global_config import global_config +from dimos.hardware.sensors.camera.module import CameraModule +from dimos.visualization.vis_module import vis_module + +from pawtrack.image_source import FileImageSource +from pawtrack.skill_container import PawTrackSkillContainer +from pawtrack.system_prompt import PAWTRACK_PROMPT + + +def build_blueprint( + robot: bool = False, camera: bool = False, source: str | None = None +): + """Compose the PawTrack subject-tracking blueprint. + + Args: + robot: Run on the full unitree_go2_agentic stack; the robot supplies + camera frames and the viewer. + camera: Off-robot -- add a webcam source and the Rerun viewer. + source: Off-robot -- feed frames from this image/video file instead of + a webcam (takes precedence over ``camera``). + + Returns: + The composed blueprint ready for ModuleCoordinator.build. + """ + container = PawTrackSkillContainer.blueprint() + if robot: + base = importlib.import_module( + "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic" + ).unitree_go2_agentic + # The agentic base already bundles the MCP server + agent, so we just + # add our skill container (no second McpClient). The prompt-configured + # agent lives in the registered `pawtrack-agentic` blueprint; this + # ``--robot`` mode is a quick on-robot test of the standalone launcher. + return autoconnect(base, container) + model = os.environ.get("PAWTRACK_MODEL", "openai:gpt-5.1") + agent = McpClient.blueprint(model=model, system_prompt=PAWTRACK_PROMPT) + parts = [McpServer.blueprint(), container, agent] + if source: + parts.append(FileImageSource.blueprint(path=source)) + parts.append(vis_module(viewer_backend=global_config.viewer)) + elif camera: + parts.append(CameraModule.blueprint()) + parts.append(vis_module(viewer_backend=global_config.viewer)) + return autoconnect(*parts) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run the PawTrack agent.") + parser.add_argument( + "--robot", + action="store_true", + help="Run on the full Unitree Go2 agentic stack (venue mode).", + ) + parser.add_argument( + "--camera", + action="store_true", + help="Off-robot: add a webcam source + Rerun viewer for an " + "end-to-end test with no robot.", + ) + parser.add_argument( + "--source", + default=None, + help="Off-robot: feed an image/video file instead of a webcam.", + ) + args = parser.parse_args() + ModuleCoordinator.build( + build_blueprint( + robot=args.robot, camera=args.camera, source=args.source + ) + ).loop() + + +if __name__ == "__main__": + main() diff --git a/hackathon/pawtrack/scripts/sim_person_pose.py b/hackathon/pawtrack/scripts/sim_person_pose.py new file mode 100644 index 0000000000..4e9949f444 --- /dev/null +++ b/hackathon/pawtrack/scripts/sim_person_pose.py @@ -0,0 +1,65 @@ +"""Plant the MuJoCo sim person at a fixed spot for the greeter demo. + +The office sim injects a ``person`` mocap body (``jeong_seun_34`` mesh, added by +``dimos.simulation.mujoco.model._add_person_object``) that +``PersonPositionController`` drives from the ``/person_pose`` LCM topic. The +person-follow demo walks it along a track; the greeter instead wants it standing +still where the dog can find it. + +This broadcasts a fixed pose, repeatedly, so the person stays put until you stop +it (Ctrl-C). The simulation must already be running to receive the pose. + + # terminal 1: the greeter sim (see run_greeter_sim.sh) + # terminal 2: plant the person ~2.5 m ahead, facing back toward the dog + PYTHONPATH=src python scripts/sim_person_pose.py --x 2.5 --y 0.0 +""" + +from __future__ import annotations + +import argparse +import math +import time + +from dimos.core.transport import LCMTransport +from dimos.msgs.geometry_msgs.Pose import Pose + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Plant the simulation person at a fixed pose." + ) + parser.add_argument("--x", type=float, default=2.5, help="World x (m).") + parser.add_argument("--y", type=float, default=0.0, help="World y (m).") + parser.add_argument( + "--yaw", type=float, default=math.pi, + help="Heading (rad); default faces back toward the spawn origin.", + ) + parser.add_argument( + "--rate-hz", type=float, default=2.0, + help="Re-broadcast rate so the pose persists across the sim loop.", + ) + args = parser.parse_args() + + half_yaw = args.yaw / 2.0 + pose = Pose( + position=[args.x, args.y, 0.0], + orientation=[0.0, 0.0, math.sin(half_yaw), math.cos(half_yaw)], + ) + period = 1.0 / args.rate_hz if args.rate_hz > 0 else 0.5 + transport: LCMTransport[Pose] = LCMTransport("/person_pose", Pose) + print( + f"Planting sim person at ({args.x}, {args.y}), yaw {args.yaw:.2f} rad. " + "Ctrl-C to stop." + ) + try: + while True: + transport.broadcast(None, pose) + time.sleep(period) + except KeyboardInterrupt: + pass + finally: + transport.stop() + + +if __name__ == "__main__": + main() diff --git a/hackathon/pawtrack/src/pawtrack/__init__.py b/hackathon/pawtrack/src/pawtrack/__init__.py new file mode 100644 index 0000000000..6ffca5d039 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/__init__.py @@ -0,0 +1,25 @@ +"""PawTrack -- pure helpers for the Go2 subject-tracking demo. + +Only dependency-free modules are exported here, so importing :mod:`pawtrack` +stays light. The DimOS glue lives in :mod:`pawtrack.skill_container`, +imported directly by the launcher and not re-exported. +""" + +from __future__ import annotations + +from pawtrack.ground_raycast import CameraIntrinsics, pixel_to_ground_point +from pawtrack.track_state import ( + TrackMetrics, + VisualObservation, + ground_contact_pixel, + visual_metrics, +) + +__all__ = [ + "CameraIntrinsics", + "TrackMetrics", + "VisualObservation", + "ground_contact_pixel", + "pixel_to_ground_point", + "visual_metrics", +] diff --git a/hackathon/pawtrack/src/pawtrack/approach_geometry.py b/hackathon/pawtrack/src/pawtrack/approach_geometry.py new file mode 100644 index 0000000000..98a4ae434a --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/approach_geometry.py @@ -0,0 +1,164 @@ +"""Pure approach geometry for the greeter (no DimOS). + +Small velocity helpers the greeter container composes with the phase machine to +drive toward a person and hold a polite standoff: + +- :func:`hold_distance_vx` -- forward/back speed that settles at the standoff + distance (deadbanded, backs off if too close). +- :func:`centering_yaw` -- yaw rate that turns to keep the subject centered. +- :func:`approach_velocity` -- the two combined for the approach drive. + +The Go2 ``cmd_vel`` is a body twist ``(vx forward, vy strafe, wz yaw)``; the +greeter uses only ``vx`` and ``wz`` (no orbiting). Pure: stdlib only, unit +tested. +""" + +from __future__ import annotations + +import math + + +def _clamp(value: float, low: float, high: float) -> float: + return max(low, min(value, high)) + + +def _wrap_angle(angle_rad: float) -> float: + """Wrap an angle to ``(-pi, pi]``.""" + return (angle_rad + math.pi) % (2.0 * math.pi) - math.pi + + +def centering_yaw( + image_error_x: float, turn_gain: float, max_yaw_radps: float +) -> float: + """Yaw rate that turns the robot to recenter the subject. + + Args: + image_error_x: Normalized horizontal centering error in ``[-1, 1]``. + turn_gain: Proportional gain from error to yaw rate. + max_yaw_radps: Symmetric clamp on the yaw command. + + Returns: + Yaw rate (rad/s); positive turns toward decreasing ``image_error_x``. + """ + return _clamp(-turn_gain * image_error_x, -max_yaw_radps, max_yaw_radps) + + +def hold_distance_vx( + distance_m: float, + standoff_m: float, + *, + deadband_m: float = 0.15, + forward_gain: float = 0.8, + max_forward_mps: float = 0.5, + max_reverse_mps: float = 0.3, +) -> float: + """Forward/back speed that holds a standoff distance, with a deadband. + + Drives forward when too far and gently backs up when too close, so the robot + keeps a safe gap (accounting for its own size) even if it overshoots toward + the subject. The deadband is the key to *not* oscillating in/out: inside + ``standoff_m +/- deadband_m`` the command is exactly zero, so once the robot + is roughly at the standoff it holds still instead of hunting back and forth. + + Args: + distance_m: Floor distance to the subject. + standoff_m: Target polite distance to hold. + deadband_m: Half-width of the no-correction band around the standoff. + forward_gain: Proportional gain from distance error to speed. + max_forward_mps: Clamp on driving forward (too far). + max_reverse_mps: Clamp on backing up (too close). + + Returns: + Forward speed (m/s); positive = forward, negative = back up, 0 in band. + """ + error = distance_m - standoff_m + if abs(error) <= deadband_m: + return 0.0 + return _clamp(forward_gain * error, -max_reverse_mps, max_forward_mps) + + +def approach_velocity( + distance_m: float, + standoff_m: float, + image_error_x: float, + *, + deadband_m: float = 0.15, + forward_gain: float = 0.8, + turn_gain: float = 1.0, + max_forward_mps: float = 0.5, + max_reverse_mps: float = 0.3, + max_yaw_radps: float = 0.8, +) -> tuple[float, float]: + """Velocity to hold the standoff distance while centering the subject. + + Forward/back from :func:`hold_distance_vx` (deadbanded, so it settles at the + standoff without oscillating and backs off if it gets too close) plus a yaw + to recenter the subject. + + Args: + distance_m: Floor distance to the subject. + standoff_m: Target polite distance to hold. + image_error_x: Normalized horizontal centering error in ``[-1, 1]``. + deadband_m: No-correction band around the standoff. + forward_gain: Gain from distance error to forward speed. + turn_gain: Gain from centering error to yaw rate. + max_forward_mps: Clamp on the forward command. + max_reverse_mps: Clamp on backing up. + max_yaw_radps: Symmetric clamp on the yaw command. + + Returns: + ``(vx, wz)`` to publish this tick. + """ + vx = hold_distance_vx( + distance_m, standoff_m, deadband_m=deadband_m, forward_gain=forward_gain, + max_forward_mps=max_forward_mps, max_reverse_mps=max_reverse_mps) + wz = centering_yaw(image_error_x, turn_gain, max_yaw_radps) + return (vx, wz) + + +def drive_to_position( + robot_xy: tuple[float, float], + robot_yaw: float, + target_xy: tuple[float, float], + standoff_m: float, + *, + forward_gain: float = 0.8, + turn_gain: float = 1.0, + max_forward_mps: float = 0.5, + max_yaw_radps: float = 0.8, + heading_tol_rad: float = 0.5, +) -> tuple[float, float]: + """Body twist to drive from an odom pose toward ``target_xy``, to the standoff. + + Yaws to face the target and drives forward only when roughly facing it + (within ``heading_tol_rad``) and still beyond the standoff, so it turns + first rather than lunging sideways. Pure geometry over the robot's odom + pose -- used to reach a subject's last-known floor position when the visual + tracker has dropped it (dead reckoning), so a brief loss does not abandon + the engagement. + + Args: + robot_xy: Robot ``(x, y)`` in the world/odom frame. + robot_yaw: Robot heading (rad) in the same frame. + target_xy: Target ``(x, y)`` in the same frame. + standoff_m: Distance to stop short of the target. + forward_gain: Proportional gain from range error to forward speed. + turn_gain: Proportional gain from heading error to yaw rate. + max_forward_mps: Clamp on the forward command. + max_yaw_radps: Symmetric clamp on the yaw command. + heading_tol_rad: Only drive forward when the heading error is within + this (else turn in place first). + + Returns: + ``(vx, wz)`` to publish this tick. + """ + dx = target_xy[0] - robot_xy[0] + dy = target_xy[1] - robot_xy[1] + range_m = math.hypot(dx, dy) + heading_error = _wrap_angle(math.atan2(dy, dx) - robot_yaw) + wz = _clamp(turn_gain * heading_error, -max_yaw_radps, max_yaw_radps) + if range_m > standoff_m and abs(heading_error) < heading_tol_rad: + vx = _clamp(forward_gain * (range_m - standoff_m), 0.0, max_forward_mps) + else: + vx = 0.0 + return (vx, wz) diff --git a/hackathon/pawtrack/src/pawtrack/greeter_container.py b/hackathon/pawtrack/src/pawtrack/greeter_container.py new file mode 100644 index 0000000000..e0244400a9 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/greeter_container.py @@ -0,0 +1,1286 @@ +"""DimOS glue for the greeter loop: wander -> identify -> select -> navigate +-> greet. + +This container orchestrates the loop but keeps every step a separable seam, so a +more capable module can later replace any one of them: + +- wander -> DimOS ``PatrollingModule`` via the injected ``PatrollingModuleSpec`` + (``start_patrol`` / ``stop_patrol`` / ``is_patrolling``). +- identify -> :func:`pawtrack.identify.detect_all` (multi-box VLM). +- select -> :func:`pawtrack.visited_registry.select_target` over a + :class:`pawtrack.visited_registry.VisitedRegistry` (random, not greeted in the + last ``revisit_forget_s`` seconds). +- navigate (approach) -> :func:`pawtrack.approach_geometry.approach_velocity` + over the geometric ground distance from the raycast (the seam a real planner + would replace). +- phase decisions -> the pure :class:`pawtrack.greeter_state.GreeterMachine`. +- greet -> halt, then wave once with ``Hello`` (1016) whichever way the person + faces; afterward recover with ``RecoveryStand`` (1006) + ``BalanceStand`` + (1002) so it can walk again -- via the injected GO2 connection (a no-op + off-robot). There is no orbit-to-front step: any seated person reached at a + safe standoff gets a wave. + +The per-tick logic lives in :meth:`_tick`, callable synchronously (the loop +thread just calls it on a clock), so the orchestration is unit-testable with +fakes for the models, the patrol spec, and the connection -- no GPU or robot. +""" + +from __future__ import annotations + +import json +import math +import random +import threading +import time +from typing import Any + +import cv2 +import numpy as np +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.models.segmentation.edge_tam import EdgeTAMProcessor +from dimos.models.vl.base import VlModel +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos.navigation.patrolling.patrolling_module_spec import ( + PatrollingModuleSpec, +) +from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.perception.detection.type.detection3d.pointcloud import Detection3DPC +from dimos.robot.unitree.dimsim_connection import DimSimConnection +from dimos.robot.unitree.go2.connection import GO2Connection +from dimos.robot.unitree.go2.connection_spec import GO2ConnectionSpec +from dimos.robot.unitree.mujoco_connection import MujocoConnection +from dimos.utils.logging_config import setup_logger +from unitree_webrtc_connect.constants import RTC_TOPIC, SPORT_CMD + +from pawtrack.approach_geometry import ( + approach_velocity, + centering_yaw, + drive_to_position, +) +from pawtrack.greeter_state import ( + GreeterMachine, + GreeterObservation, + GreeterParams, + GreeterSnapshot, +) +from pawtrack.ground_raycast import CameraIntrinsics, pixel_to_ground_point +from pawtrack.identify import detect_all, detect_facing +from pawtrack.occupancy import is_near_obstacle +from pawtrack.qwen_china import QwenChinaVlModel +from pawtrack.track_state import clamp_bbox, ground_contact_pixel, valid_bbox +from pawtrack.visited_registry import Candidate, VisitedRegistry, select_target + +logger = setup_logger() + +_HELLO_API_ID = SPORT_CMD["Hello"] +_BALANCE_STAND_API_ID = SPORT_CMD["BalanceStand"] +_RECOVERY_STAND_API_ID = SPORT_CMD["RecoveryStand"] +_FINGER_HEART_API_ID = SPORT_CMD["FingerHeart"] +# api_id -> name, for readable sport-command tracing in the logs. +_SPORT_NAMES = {api_id: name for name, api_id in SPORT_CMD.items()} + + +class Config(ModuleConfig): + """Configuration for the greeter loop.""" + + description: str = "a person sitting on a chair" + loop_hz: float = 12.0 + scan_interval_s: float = 1.5 # how often to run the VLM while wandering + # When wander can see only already-greeted people, rotate in place to look + # for someone new: the patrol is usually goal-less and blocked by the person + # right ahead, so the dog would otherwise sit frozen facing a handled + # subject. Bounded so it hands back to the patrol instead of spinning forever + # (e.g. once everyone in view has been greeted). + wander_rescan_yaw_radps: float = 0.5 + wander_rescan_max_s: float = 8.0 + # Free-explore fallback. The coverage patrol goes silent once it has no goal + # (e.g. coverage exhausted), and is_patrolling() cannot tell us that, so the + # dog would sit frozen. Detect the wedge from odom -- no position change + # beyond stall_move_threshold_m for stall_timeout_s -- then take over and + # roam into new space (brief in-place reorient, then forward) until the dog + # has escaped roam_escape_distance_m from the stuck spot (or roam_max_s + # elapses), then hand back so the patrol can replan from the new location. + # Needs odom; with none it falls back to the rotate-only rescan and never + # drives blind. roam_forward_mps stays above the sim RL translation deadband + # (~0.2-0.3 m/s). + enable_roam_when_wedged: bool = True + stall_move_threshold_m: float = 0.3 + stall_timeout_s: float = 4.0 + roam_forward_mps: float = 0.35 + roam_yaw_radps: float = 0.6 + roam_turn_s: float = 1.5 + roam_escape_distance_m: float = 1.0 + roam_max_s: float = 6.0 + # Stop the loop (failing safe) after this many consecutive tick exceptions. + max_consecutive_errors: int = 5 + # Phase thresholds (fed to the pure GreeterMachine). + # 2.0 m standoff: the Go2's onboard obstacle avoidance halts the dog ~2 m + # from a person, so targeting 1.5 m left it stuck just outside greeting range + # (distance plateaued ~2.0-2.5 m and timed out). Hold + greet from ~2 m, which + # it can actually reach. + standoff_m: float = 2.0 + standoff_tolerance_m: float = 0.3 + # Hard floor on the greeting distance -- the dog's footprint + Hello reach. + # Relaxed to 0.6 m: seated people (especially at desks) tend to end up close, + # and the window would otherwise reject them. Greet window is + # [min_safe_distance_m, standoff_m + standoff_tolerance_m] = [0.6, 2.3] m. + min_safe_distance_m: float = 0.6 + engage_timeout_s: float = 10.0 # bail to patrol if an engagement stalls + # On track loss, dead-reckon to the subject's last-known floor position + # (odom) rather than abandoning. Only drive forward when the heading error to + # it is under this; on arrival, a candidate within redetect_radius_m of the + # last-known spot counts as re-acquiring the same subject (else: it moved / + # was a bad fix -> give up). + deadreckon_heading_tol_rad: float = 0.5 + redetect_radius_m: float = 1.0 + greet_duration_s: float = 4.0 + cooldown_duration_s: float = 6.0 + # After the wave, if the subject faces the camera, reward them with a + # FingerHeart; this is how long to let that gesture play before recovering. + heart_duration_s: float = 3.0 + # Settle time between RecoveryStand and BalanceStand in a one-shot wave. + recover_settle_s: float = 1.5 + # A sport command (Hello / BalanceStand / RecoveryStand) goes out over the + # WebRTC data channel, which raises "Data channel is not open" whenever the + # channel briefly drops mid-session. Retry a bounded number of times to + # bridge a transient blip; a persistent failure is logged and skipped rather + # than crashing the control loop. + sport_retry_attempts: int = 3 + sport_retry_delay_s: float = 0.3 + # Treat a candidate within this radius of a greeted position as the same + # subject (chair footprint + tracker noise), so it is not greeted twice. + revisit_radius_m: float = 1.0 + # How long a greeted subject is skipped before it may be greeted again. The + # demo stays lively instead of going quiet once everyone has been greeted + # once; set to None to greet each subject only once per run. + revisit_forget_s: float | None = 60.0 + # Approach distance control: hold the standoff from the geometric ground + # distance, yaw to recenter. The deadband holds the dog steady at the standoff + # without hunting in/out; the bounded reverse backs it off if it gets too + # close. + approach_forward_gain: float = 0.8 + approach_turn_gain: float = 1.0 + approach_max_forward_mps: float = 0.5 + approach_max_reverse_mps: float = 0.3 + standoff_deadband_m: float = 0.2 + approach_max_yaw_radps: float = 0.8 + # Ground raycast (subject floor position + distance). + camera_info: CameraInfo | None = None # auto-resolved if None + world_frame_id: str = "world" + camera_optical_frame_id: str = "camera_optical" + # How often (s) to log a lidar-path diagnostic line, so a fallback run shows + # which boundary failed (no cloud / empty / no TF / no points in the box). + lidar_log_interval_s: float = 3.0 + world_floor_z_m: float = 0.0 + tf_tolerance_s: float = 1.0 + # Reflection filter: drop a candidate whose floor position lands on (or + # within this clearance of) a mapped wall in the navigation costmap. A person + # reflected in a glass wall is detected by the VLM and ranged by the lidar at + # the glass surface, so without this the dog walks over and waves at the + # reflection; a real subject stands in free space. Fails open -- no costmap + # means no filtering -- so a fresh, unmapped run greets exactly as before. + reject_on_mapped_obstacle: bool = True + obstacle_clearance_m: float = 0.3 + obstacle_cost_threshold: int = 50 # 0 free .. 100 lethal; raise to be lenient + # Distance sanity bound: drop a candidate whose floor range from the robot + # exceeds this. A box near the image horizon raycasts to a near-infinite floor + # point, so without this the dog commits to phantom targets metres (even tens + # of metres) away and walks at empty space. Set generously (30 m) so the dog + # can still notice and head toward distant real people while the bound only + # culls the clearly-impossible horizon artifacts. + max_subject_distance_m: float = 30.0 + + +class GreeterSkillContainer(Module): + """Autonomous greeter: wander, find a seated person, face them, say hi.""" + + config: Config + + color_image: In[Image] + lidar: In[PointCloud2] # world-frame lidar cloud for subject ranging + odom: In[PoseStamped] # robot world pose, for dead reckoning to a lost subject + global_costmap: In[OccupancyGrid] # nav costmap; rejects on-wall candidates + cmd_vel: Out[Twist] + debug_image: Out[Image] # annotated frame: bbox + phase/distance/front + greeter_phase: Out[str] # rich JSON trace: phase + why (distinct from the + # ``greeter_status`` skill below -- a port and a skill must not share a name) + subject_world_pose: Out[PoseStamped] # where it thinks the subject is (3D) + # Injected by the blueprint: the patrol provides the wander; the GO2 + # connection issues the greeting (None off-robot -> greeting is a no-op). + _patrolling_module_spec: PatrollingModuleSpec + _connection: GO2ConnectionSpec | None = None + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._lock = threading.RLock() + self._should_stop = threading.Event() + self._thread: threading.Thread | None = None + self._latest_image: Image | None = None + self._latest_pointcloud: PointCloud2 | None = None + self._latest_odom: PoseStamped | None = None + self._latest_costmap: OccupancyGrid | None = None + self._last_lidar_log_s = 0.0 + # Horizontal bbox error of the last successful track, for re-centering + # the search yaw when the tracker briefly drops the subject. + self._last_error_x = 0.0 + self._machine = GreeterMachine(GreeterParams( + standoff_m=self.config.standoff_m, + standoff_tolerance_m=self.config.standoff_tolerance_m, + min_safe_distance_m=self.config.min_safe_distance_m, + engage_timeout_s=self.config.engage_timeout_s, + greet_duration_s=self.config.greet_duration_s, + cooldown_duration_s=self.config.cooldown_duration_s, + )) + self._registry = VisitedRegistry( + self.config.revisit_radius_m, self.config.revisit_forget_s) + self._rng = random.Random() + self._vl_model: VlModel | None = None + self._tracker: EdgeTAMProcessor | None = None + # What to find / approach / wave at. Defaults to the configured subject; + # start_greeting(target=...) overrides it per run (e.g. "a chair" in sim). + self._description = self.config.description + self._chosen: Candidate | None = None + self._last_position: tuple[float, float] | None = None + self._last_scan_s = 0.0 + # How many detections the last scan dropped (too far / on a wall), so + # wander can rotate away from a wall/reflection instead of freezing. + self._last_scan_dropped = 0 + # Rescan state: rotating in place to look past already-greeted people. + self._rescanning = False + self._rescan_started_s = 0.0 + # Free-explore (roam-when-wedged) state, driven by odom stall detection. + self._roaming = False + self._roam_anchor_xy: tuple[float, float] | None = None + self._roam_progress_s = 0.0 + self._roam_start_xy: tuple[float, float] = (0.0, 0.0) + self._roam_started_s = 0.0 + self._roam_turn_dir = 1.0 + self._last_snapshot: GreeterSnapshot | None = None + self._last_trace: str | None = None # last rich greeter_phase JSON + camera_info = self._resolve_camera_info() + self._camera_info = camera_info # for the lidar pointcloud projection + self._intrinsics = ( + CameraIntrinsics.from_k(list(camera_info.K)) + if camera_info is not None else None + ) + + def _resolve_camera_info(self) -> CameraInfo | None: + """Pick camera intrinsics: explicit config, else sim/Go2 statics.""" + camera_info = self.config.camera_info + if camera_info is not None: + return camera_info + simulation = self.config.g.simulation + if simulation == "mujoco": + return MujocoConnection.camera_info_static + if simulation == "dimsim": + return DimSimConnection.camera_info_static + return GO2Connection.camera_info_static + + @rpc + def start(self) -> None: + super().start() + self.register_disposable( + Disposable(self.color_image.subscribe(self._on_color_image)) + ) + # The lidar input is optional: when no producer is wired (e.g. a sim + # without lidar), this simply never fires and _locate falls back to the + # monocular ground raycast. + self.register_disposable( + Disposable(self.lidar.subscribe(self._on_pointcloud)) + ) + # Odom drives the dead-reckon-to-last-known fallback on track loss; if no + # producer is wired it never fires and the loss falls back to an in-place + # re-find search. + self.register_disposable( + Disposable(self.odom.subscribe(self._on_odom)) + ) + # The costmap feeds the reflection filter (drop candidates on a mapped + # wall). Optional: if no mapper is wired the filter simply never fires. + self.register_disposable( + Disposable(self.global_costmap.subscribe(self._on_costmap)) + ) + # Trace the connection up front: if it is None, every sport gesture + # (Hello / FingerHeart / Recovery / Balance) silently no-ops -- the first + # thing to check when "it greets but does not wave". + conn = self._connection + logger.info( + "greeter: GO2 connection = %s", + type(conn).__name__ if conn is not None + else "NONE -- sport gestures will no-op") + + @rpc + def stop(self) -> None: + self._stop_loop() + # DimOS may tear the module down without going through stop_greeting, so + # halt here too -- otherwise the last cmd_vel or an active patrol persists. + self._halt() + with self._lock: + tracker = self._tracker + self._tracker = None + vl_model = self._vl_model + self._vl_model = None + if tracker is not None: + tracker.stop() + if vl_model is not None: + vl_model.stop() + super().stop() + + # -- Skills -- + + @skill + def start_greeting(self, target: str | None = None) -> str: + """Start the autonomous greeter loop: wander, find the target, walk over, + and wave hello (whichever way it faces). Call stop_greeting to halt. + + Args: + target: What to look for, approach, and wave at, as a short visual + description (e.g. "a person sitting on a chair", "a person", "a + chair"). Defaults to the configured subject when omitted. Handy + in simulation, where there may be no seated person -- point it at + "a chair" or "a person" to exercise the loop. + """ + with self._lock: + if self._thread is not None and self._thread.is_alive(): + return "Greeter is already running." + # Per-run target: an explicit one overrides the configured default + # (cleared back to the default when omitted). Set before the thread + # starts so the loop's scan sees it. + self._description = target or self.config.description + # Start every run from a clean wander state, so a stop during + # approach/greet/cooldown cannot resume mid-engagement. The + # visited registry is intentionally kept across restarts. + self._reset_engagement() + self._should_stop.clear() + self._thread = threading.Thread( + target=self._run_loop, name="greeter", daemon=True + ) + self._thread.start() + return f"Greeter started. Looking for {self._description!r} to greet." + + @skill + def stop_greeting(self) -> str: + """Stop the greeter loop and halt the robot.""" + self._stop_loop() + self._halt() # zero velocity + stop the patrol (it owns motion in wander) + self._publish_status(GreeterSnapshot( + state="wander", message="Greeter stopped; idle.")) + return "Greeter stopped." + + @skill + def greeter_status(self) -> str: + """Report the greeter's current phase and why, as JSON. + + Includes the phase, the per-tick perception driving it (subject visible, + distance), whether the patrol is running, where it thinks the subject is, + and how many have been greeted -- enough to tell why it is or is not + progressing. + """ + if self._last_trace is None: + return GreeterSnapshot( + state="wander", message="Idle; greeter not started." + ).to_json() + return self._last_trace + + @skill + def wave_hello(self) -> str: + """Make the dog wave hello right now: stop, stand still, play the Hello + gesture, then recover (RecoveryStand + BalanceStand) so it can walk again. + + If the autonomous greeter loop is running, this stops it first and waits + for the tick thread to exit, so nothing restarts the patrol or publishes + cmd_vel during the gesture -- call start_greeting to resume greeting + afterward. The same greeting the loop performs; handy to greet on command + or to verify the wave end-to-end. A no-op off-robot (no GO2 connection). + """ + # Stop the loop and JOIN the tick thread before moving on, so no tick can + # restart the patrol or drive cmd_vel while the gesture plays. Then stop + # the patrol and zero velocity. + self._stop_loop() + self._halt() + delivered = self._send_sport(_HELLO_API_ID) + if delivered: + time.sleep(self.config.greet_duration_s) # let the gesture play out + # Restore a clean stand + balance so it can walk again afterward. + self._recover_locomotion() + if not delivered: + return ( + "Tried to wave, but the Hello gesture did not go out (no GO2 " + "connection, or the data channel is not open)." + ) + return "Waved hello." + + # -- Loop -- + + def _run_loop(self) -> None: + period = 1.0 / self.config.loop_hz + next_time = time.monotonic() + errors = 0 + while not self._should_stop.is_set(): + next_time += period + try: + self._tick(time.monotonic()) + errors = 0 + except Exception: # noqa: BLE001 - thread-boundary guard + # Fail safe on every tick error (halt, do not keep the last + # motion command running), and give up after a persistent run of + # them rather than driving on a broken detector/TF/model/stream. + errors += 1 + logger.exception( + "greeter tick crashed (%d in a row)", errors) + self._halt() + if errors >= self.config.max_consecutive_errors: + self._set_error_status( + "Greeter stopped after repeated errors; see logs.") + break + now = time.monotonic() + sleep_for = next_time - now + if sleep_for > 0: + self._should_stop.wait(sleep_for) + else: + next_time = now + + def _tick(self, now: float) -> GreeterObservation: + """One synchronous loop step: observe + drive, advance the machine. + + Returns the observation it fed the machine (handy for tests). + """ + with self._lock: + image = self._latest_image + state = self._machine.state + if state == "wander": + acquired = self._wander_tick(image, now) + obs = GreeterObservation( + target_acquired=acquired, subject_visible=acquired) + elif state == "approach": + obs = self._engage_tick(image) + else: # greet / cooldown -- the sport gesture plays here + # Do NOT stream cmd_vel during the gesture. The dog was already + # halted on the greet edge (_on_enter -> _halt), and on the Go2, + # streaming cmd_vel keeps it in joystick/balance mode, which + # suppresses sport animations (Hello / FingerHeart) and the + # RecoveryStand -- a likely reason a *sent* gesture does not play. + obs = GreeterObservation() + + snapshot = self._machine.step(obs, now) + if snapshot.entered: + self._on_enter(snapshot.state, snapshot.reason, now) + self._publish_status(snapshot, obs) + return obs + + # -- Step: wander (DimOS patrol) + identify + select -- + + def _wander_tick(self, image: Image | None, now: float) -> bool: + """Roam (via the patrol), scan for a target, and break free if wedged. + + The patrol normally owns wander motion. But the patrol often has no goal + when the dog faces something it cannot act on -- everyone in view already + greeted, or every detection rejected by the filters (too far, or on a + mapped wall, e.g. a glass reflection) -- and the thing ahead is an + obstacle, so the dog freezes staring at it. In that case the greeter + rotates in place to rescan a new direction, then (bounded) hands back to + the patrol from the new heading. + """ + chosen: Candidate | None = None + candidates: list[Candidate] = [] + if (image is not None + and now - self._last_scan_s >= self.config.scan_interval_s): + self._last_scan_s = now + candidates = self._scan(image) + chosen = select_target(candidates, self._registry, self._rng, now) + # Show each scan in the debug video: what was detected and the pick. + scan_bbox = chosen.bbox if chosen else ( + candidates[0].bbox if candidates else None) + self._publish_debug( + image, scan_bbox, f"wander: {len(candidates)} cand") + # Lock the tracker first; only stop the patrol once that succeeds, so + # a failed init does not leave the robot idle with the patrol paused. + if chosen is not None and self._init_tracker(chosen.bbox, image): + self._patrolling_module_spec.stop_patrol() + self._rescanning = False + self._chosen = chosen + self._last_position = chosen.position + logger.info( + "greeter: engaging subject at (%.2f, %.2f) of %d " + "candidate(s)", + chosen.position[0], chosen.position[1], len(candidates)) + return True + if chosen is not None: + logger.warning( + "greeter: tracker init failed on the chosen subject") + # Rotate to a new heading when the view holds something the dog + # cannot act on -- people already greeted (kept candidates) OR + # detections all rejected as too-far/on-a-wall (a wall/reflection the + # filter dropped). Either way the patrol is usually goal-less here, so + # rotating finds a fresh direction instead of freezing. Stop once the + # view is genuinely empty so the patrol can explore from there. + saw_something = bool(candidates) or self._last_scan_dropped > 0 + if saw_something and chosen is None and not self._rescanning: + self._rescanning = True + self._rescan_started_s = now + elif not saw_something: + self._rescanning = False + # Don't spin forever (e.g. everyone in view is greeted): after a bounded + # rotation, hand back to the patrol to retry from the new heading. + if self._rescanning and ( + now - self._rescan_started_s >= self.config.wander_rescan_max_s): + self._rescanning = False + if self._rescanning: + if self._patrolling_module_spec.is_patrolling(): + self._patrolling_module_spec.stop_patrol() + self.cmd_vel.publish(self._rescan_twist()) + self._reset_roam(now) # rotating is activity; don't also roam now + return False + # Free-explore: when odom shows the dog is wedged (patrol goal-less and + # not moving), roam into new space, then hand back to the patrol. + roam = self._roam_tick(now) + if roam is not None: + if self._patrolling_module_spec.is_patrolling(): + self._patrolling_module_spec.stop_patrol() + self.cmd_vel.publish(roam) + elif not self._patrolling_module_spec.is_patrolling(): + self._patrolling_module_spec.start_patrol() + return False + + def _rescan_twist(self) -> Twist: + """In-place yaw to scan a new direction when wedged on visited people.""" + return Twist( + linear=Vector3(0.0, 0.0, 0.0), + angular=Vector3(0.0, 0.0, self.config.wander_rescan_yaw_radps)) + + def _reset_roam(self, now: float) -> None: + """Clear roam state (rescan takes over, the dog escaped, or a new run).""" + self._roaming = False + self._roam_anchor_xy = None + self._roam_progress_s = now + + def _roam_tick(self, now: float) -> Twist | None: + """Roam into new space when the dog is wedged; None when it is not. + + The coverage patrol cannot tell us it is goal-less, so the wedge is + detected from odom: no position change beyond ``stall_move_threshold_m`` + for ``stall_timeout_s``. On a wedge the dog takes over -- a brief in-place + reorient (``roam_turn_s``) so it does not drive straight back into + whatever blocked it, then forward at ``roam_forward_mps`` -- until it has + escaped ``roam_escape_distance_m`` from the stuck spot (or ``roam_max_s`` + elapses), then it stops roaming so the caller hands back to the patrol. + Returns None (no roam) when disabled, when odom is unavailable (never + drive blind), or while the dog is already making progress. + """ + if not self.config.enable_roam_when_wedged: + return None + with self._lock: + odom = self._latest_odom + if odom is None: + return None + xy = (odom.position.x, odom.position.y) + if self._roam_anchor_xy is None: + self._roam_anchor_xy = xy + self._roam_progress_s = now + if not self._roaming: + moved = math.hypot( + xy[0] - self._roam_anchor_xy[0], xy[1] - self._roam_anchor_xy[1]) + if moved > self.config.stall_move_threshold_m: + self._roam_anchor_xy = xy # progress -> reset the stall timer + self._roam_progress_s = now + return None + if now - self._roam_progress_s < self.config.stall_timeout_s: + return None + # Wedged: start roaming from here, in a random direction. + self._roaming = True + self._roam_start_xy = xy + self._roam_started_s = now + self._roam_turn_dir = self._rng.choice((-1.0, 1.0)) + escaped = math.hypot( + xy[0] - self._roam_start_xy[0], xy[1] - self._roam_start_xy[1]) + if (escaped > self.config.roam_escape_distance_m + or now - self._roam_started_s > self.config.roam_max_s): + self._reset_roam(now) + return None + # Reorient in place first, then drive forward (a slight same-way curve so + # repeated wedges sweep the area rather than retrace one line). + if now - self._roam_started_s < self.config.roam_turn_s: + return Twist( + linear=Vector3(0.0, 0.0, 0.0), + angular=Vector3( + 0.0, 0.0, self._roam_turn_dir * self.config.roam_yaw_radps)) + return Twist( + linear=Vector3(self.config.roam_forward_mps, 0.0, 0.0), + angular=Vector3( + 0.0, 0.0, + self._roam_turn_dir * self.config.roam_yaw_radps * 0.3)) + + def _scan(self, image: Image) -> list[Candidate]: + """Detect every described subject and locate each on the floor. + + Candidates whose floor position lands on a mapped wall are dropped: a + person reflected in a glass wall is detected by the VLM and ranged by + the lidar at the glass surface, so without this filter the dog walks + over and waves at the reflection. A real subject stands in free space. + """ + candidates: list[Candidate] = [] + n_too_far = 0 + n_on_wall = 0 + for raw in detect_all(self._get_vl_model(), image, self._description): + # Reject out-of-frame / degenerate boxes here, before one can be + # selected and stop the patrol only to fail tracker init later. + if not valid_bbox(list(raw), image.width, image.height): + continue + # Clamp to the frame BEFORE raycasting/storing: a box that spills off + # the edge would otherwise raycast an off-image bottom-center to a + # wrong floor position (and visited key). The tracker init clamps too, + # so the candidate position and the tracked box now agree. + box = clamp_bbox(raw, image.width, image.height) + located = self._locate(box, image) + if located is None: + continue + position, distance = located + # Distance sanity bound: a box near the horizon raycasts to a + # near-infinite floor point, so drop implausibly far "subjects" + # before engaging a phantom and walking at empty space. + if distance > self.config.max_subject_distance_m: + n_too_far += 1 + continue + if self._on_mapped_obstacle(position): + n_on_wall += 1 + continue + candidates.append(Candidate(position=position, bbox=box)) + # Record drops so wander can tell "empty room" (do nothing, let the + # patrol roam) from "saw things but rejected them all" (wedged facing a + # wall/reflection -> rotate to look elsewhere, since the candidate list + # being emptied would otherwise suppress the rescan). + self._last_scan_dropped = n_too_far + n_on_wall + if n_too_far or n_on_wall: + logger.info( + "greeter scan: dropped %d beyond %.0fm and %d on a mapped wall " + "(ranging artifacts / glass reflections)", + n_too_far, self.config.max_subject_distance_m, n_on_wall) + return candidates + + def _on_mapped_obstacle(self, position: tuple[float, float]) -> bool: + """Whether a candidate floor position lands on/near a mapped wall. + + Reads the latest global costmap (``world`` frame, same as the candidate) + and tests it with :func:`pawtrack.occupancy.is_near_obstacle`. Returns + False when the filter is disabled or no costmap has arrived, so a missing + map never suppresses greeting. + """ + if not self.config.reject_on_mapped_obstacle: + return False + with self._lock: + costmap = self._latest_costmap + if costmap is None: + return False + return is_near_obstacle( + costmap.grid, + costmap.resolution, + (costmap.origin.position.x, costmap.origin.position.y), + position, + clearance_m=self.config.obstacle_clearance_m, + cost_threshold=self.config.obstacle_cost_threshold, + ) + + # -- Step: navigate (approach) -- + + def _engage_tick(self, image: Image | None) -> GreeterObservation: + """Track the chosen subject and drive toward the standoff to wave. + + Greeting is unconditional once visible at a safe standoff (the machine + gates on distance, not facing) -- no orbit-to-front step. While the + tracker holds the subject this visually servos toward it (and keeps the + last-known position refreshed, so a *moving* subject is chased). On a + track loss it does not bail; it dead-reckons to the last-known position + and re-detects there (:meth:`_deadreckon_tick`). + """ + if image is None: + self.cmd_vel.publish(Twist.zero()) + return GreeterObservation(subject_visible=False) + detection = self._track(image) + if detection is None: + return self._deadreckon_tick(image) + x1, _, x2, _ = detection.bbox + self._last_error_x = ( + ((x1 + x2) / 2.0 - image.width / 2.0) / (image.width / 2.0)) + + located = self._locate(detection.bbox, image) + distance = None if located is None else located[1] + if located is not None: + self._last_position = located[0] # refresh dest from the live position + self._publish_world_pose(image, located[0]) # 3D marker in Rerun + + self.cmd_vel.publish( + self._approach_twist(detection.bbox, image.width, distance)) + dist_label = "?" if distance is None else f"{distance:.1f}m" + self._publish_debug(image, detection.bbox, f"approach d={dist_label}") + return GreeterObservation(subject_visible=True, distance_m=distance) + + def _deadreckon_tick(self, image: Image) -> GreeterObservation: + """Tracker lost the subject: drive to its last-known spot, re-detect there. + + Uses odom (not the camera) to navigate to the last-known floor position, + so a brief loss does not abandon the engagement. On arrival it re-runs + the detector: a match near the last-known spot re-acquires the subject + (it may have shifted); none means it moved off or was a bad fix, and the + machine gives up (``reached_destination`` with the subject still unseen). + Falls back to an in-place re-find yaw when no odom or last position is + available. + """ + with self._lock: + odom = self._latest_odom + if odom is None or self._last_position is None: + self.cmd_vel.publish(self._search_twist()) + self._publish_debug(image, None, f"searching: {self._description}") + return GreeterObservation(subject_visible=False) + robot_xy = (odom.position.x, odom.position.y) + range_m = math.hypot( + self._last_position[0] - robot_xy[0], + self._last_position[1] - robot_xy[1]) + limit = self.config.standoff_m + self.config.standoff_tolerance_m + if range_m > limit: + vx, wz = drive_to_position( + robot_xy, odom.yaw, self._last_position, self.config.standoff_m, + forward_gain=self.config.approach_forward_gain, + turn_gain=self.config.approach_turn_gain, + max_forward_mps=self.config.approach_max_forward_mps, + max_yaw_radps=self.config.approach_max_yaw_radps, + heading_tol_rad=self.config.deadreckon_heading_tol_rad) + self.cmd_vel.publish( + Twist(linear=Vector3(vx, 0.0, 0.0), + angular=Vector3(0.0, 0.0, wz))) + self._publish_debug( + image, None, f"to last seen d={range_m:.1f}m") + return GreeterObservation( + subject_visible=False, distance_m=range_m, + reached_destination=False) + # Arrived at the last-known spot: stop, and try to re-detect there. + self.cmd_vel.publish(Twist.zero()) + now = time.monotonic() + if now - self._last_scan_s < self.config.scan_interval_s: + # The re-scan (a VLM query) is rate-limited; hold and wait for it + # rather than declaring the subject gone before we have looked. + self._publish_debug(image, None, "arrived; re-checking") + return GreeterObservation( + subject_visible=False, distance_m=range_m, + reached_destination=False) + self._last_scan_s = now + position = self._reacquire_near_last(image) + if position is not None: + self._last_position = position + self._publish_world_pose(image, position) + self._publish_debug(image, None, "re-acquired at last seen") + return GreeterObservation(subject_visible=True, distance_m=range_m) + # Re-scanned and nothing near the last-known spot: it moved off or was a + # bad fix -- give up (the machine skips it on reached_destination). + self._publish_debug(image, None, "arrived, subject gone") + return GreeterObservation( + subject_visible=False, distance_m=range_m, reached_destination=True) + + def _reacquire_near_last(self, image: Image): + """Re-detect a subject near the last-known position; re-init the tracker. + + Runs the multi-box VLM detect and matches the candidate nearest the + last-known floor position. Returns that position (within + ``redetect_radius_m``) after re-seeding the tracker on its box, or None + if nothing matches close enough. + """ + candidates = self._scan(image) + if not candidates or self._last_position is None: + return None + nearest = min( + candidates, + key=lambda c: math.dist(c.position, self._last_position)) + if math.dist(nearest.position, self._last_position) > ( + self.config.redetect_radius_m): + return None + if not self._init_tracker(nearest.bbox, image): + return None + return nearest.position + + def _search_twist(self) -> Twist: + """Yaw toward the subject's last-seen side to bring it back into frame. + + On a brief track loss the subject has usually just slid out of the + camera's view as the dog turned to engage; rotating the way that would + re-centre its last-seen position recovers it, rather than freezing + (which strands an off-angle subject out of frame until the lost timeout + gives up). Pure yaw, no forward motion -- it does not drive blind. + """ + wz = centering_yaw( + self._last_error_x, self.config.approach_turn_gain, + self.config.approach_max_yaw_radps) + return Twist( + linear=Vector3(0.0, 0.0, 0.0), angular=Vector3(0.0, 0.0, wz)) + + def _approach_twist( + self, bbox, image_width: int, distance_m: float | None + ) -> Twist: + """Drive toward the subject to the standoff distance (the planner seam). + + Uses the geometric ground distance from the raycast plus bbox centering, + not a monocular width model: our target is a person *on a chair*, whose + tracked box is wider than the person-shoulder width a width-based + estimator assumes, which would stop the approach short. A real planner + would replace this method. + """ + x1, _, x2, _ = bbox + error_x = ((x1 + x2) / 2.0 - image_width / 2.0) / (image_width / 2.0) + if distance_m is None: + # No ground fix: face the subject but do not drive blind. + wz = centering_yaw( + error_x, self.config.approach_turn_gain, + self.config.approach_max_yaw_radps) + return Twist( + linear=Vector3(0.0, 0.0, 0.0), angular=Vector3(0.0, 0.0, wz)) + vx, wz = approach_velocity( + distance_m, self.config.standoff_m, error_x, + deadband_m=self.config.standoff_deadband_m, + forward_gain=self.config.approach_forward_gain, + turn_gain=self.config.approach_turn_gain, + max_forward_mps=self.config.approach_max_forward_mps, + max_reverse_mps=self.config.approach_max_reverse_mps, + max_yaw_radps=self.config.approach_max_yaw_radps, + ) + return Twist( + linear=Vector3(vx, 0.0, 0.0), angular=Vector3(0.0, 0.0, wz)) + + # -- Step: greet (on phase edges) -- + + def _on_enter( + self, state: str, reason: str | None = None, now: float = 0.0 + ) -> None: + # One INFO line per phase change narrates the whole run in the logs. + logger.info( + "greeter -> %s%s", state, f" ({reason})" if reason else "") + if state == "greet": + # Halt first: stop the patrol and override this tick's approach + # command, so the wave does not start with the dog still moving. + self._halt() + self._send_sport(_HELLO_API_ID) + elif state == "cooldown": + # Record the greet and clear the target FIRST. These are pure and + # cannot fail, so a hiccup in the fallible steps below (a VLM timeout + # or a sport-command error) can never leave the subject un-marked -- + # which would otherwise let the loop re-select and re-greet it, + # because these entered-edge effects do not run again. + if self._last_position is not None: + self._registry.mark_visited(self._last_position, now) + self._chosen = None + # Optional flair: if the subject faces the camera, reward them with a + # FingerHeart; if not (or unsure), just move on. Best-effort -- a VLM + # or sport failure here must not skip the mandatory recovery below. + # The dog is stopped, so the brief blocking gesture is fine (same + # pattern as the one-shot wave_hello). + try: + facing = self._detect_facing() + # Logged both ways so the decision is observable in sim, where + # the FingerHeart animation itself is a no-op. + logger.info( + "greeter: %r faces the camera=%s%s", + self._description, facing, + " -> FingerHeart" if facing else " -> no heart, moving on") + if facing: + if self._send_sport(_FINGER_HEART_API_ID): + time.sleep(self.config.heart_duration_s) + except Exception: # noqa: BLE001 - flair must never block recovery + logger.exception( + "greeter: facing/heart step failed; skipping it") + # Mandatory: the gesture(s) (Hello, and FingerHeart which rears up on + # its hind legs) leave the robot out of BalanceStand. Issue + # RecoveryStand now; BalanceStand is deferred to the wander edge so + # RecoveryStand gets the whole cooldown to physically stand the dog up + # before BalanceStand -- sending BalanceStand right after (a 1.5 s + # settle) interrupted the stand-up on the real robot. + self._send_sport(_RECOVERY_STAND_API_ID) + elif state == "wander": + # Record/clear state FIRST (pure, cannot fail): skip a subject we gave + # up on -- a stalled approach (reason "stuck") or one that was gone + # when we reached its last-known spot (reason "lost") -- so we do not + # re-pick and re-fail on it. Doing this before the commands below means + # a sport hiccup cannot leave it un-skipped. The revisit-forget window + # still makes it eligible again later (e.g. a person who stepped away + # returns). + if reason in ("stuck", "lost") and self._last_position is not None: + self._registry.mark_visited(self._last_position, now) + self._chosen = None + self._last_position = None + # Halt any leftover engage command, then re-enable BalanceStand so the + # resuming patrol's cmd_vel is accepted -- and, after a greet, this is + # the BalanceStand that completes the gesture recovery (RecoveryStand + # ran on the cooldown edge and has had the cooldown to settle). The + # patrol is (re)started by the next _wander_tick. + self.cmd_vel.publish(Twist.zero()) + self._send_sport(_BALANCE_STAND_API_ID) + + def _detect_facing(self) -> bool: + """Whether the just-greeted subject faces the camera (one-shot VLM). + + Asked once after the wave, while the dog is stopped, so its latency does + not matter and it costs no extra GPU (the VL model is remote). An + ambiguous or unparseable answer defaults to False, so an uncertain read + does not trigger the rear-up FingerHeart. + """ + with self._lock: + image = self._latest_image + if image is None: + return False + return detect_facing( + self._get_vl_model(), image, self._description) is True + + def _recover_locomotion(self) -> None: + """Restore a clean standing + balance state after a sport gesture. + + A gesture (Hello or the FingerHeart) leaves the robot out of + BalanceStand, so cmd_vel is ignored. Bring it back up with RecoveryStand, + let it settle, then enter BalanceStand so it can walk and do other things + again. Synchronous (blocks for ``recover_settle_s``); used both on the + loop's cooldown edge and by the one-shot wave. A no-op off-robot. + """ + self._send_sport(_RECOVERY_STAND_API_ID) + time.sleep(self.config.recover_settle_s) + self._send_sport(_BALANCE_STAND_API_ID) + + def _send_sport(self, api_id: int) -> bool: + """Issue a sport command, tolerating a transient closed data channel. + + The WebRTC publish raises ``Data channel is not open`` whenever the + channel is not in the open state -- it can briefly drop mid-session. A + failed gesture must never crash the control loop, so this retries a few + times to bridge a transient blip, then logs and gives up. Any other + failure is re-raised for the loop's fail-safe to handle. + + Args: + api_id: The SPORT_MOD api id to publish (e.g. Hello, BalanceStand). + + Returns: + True if the command was delivered, False if the channel stayed + closed across every retry. + """ + name = _SPORT_NAMES.get(api_id, "?") + connection = self._connection + if connection is None: + logger.warning( + "greeter: NO GO2 connection -- sport %s (%d) is a no-op " + "(this is why a gesture would not play)", name, api_id) + return False + last_error: Exception | None = None + for attempt in range(self.config.sport_retry_attempts): + try: + connection.publish_request( + RTC_TOPIC["SPORT_MOD"], {"api_id": api_id}) + # Log on success too -- otherwise the gesture/recovery sequence + # (Hello / RecoveryStand / BalanceStand / FingerHeart) is + # invisible in the run log, which made the "no wave / no stand + # up" reports hard to diagnose. + logger.info( + "greeter: sent sport %s (%d)%s", name, api_id, + f" [retry {attempt}]" if attempt else "") + return True + except Exception as error: # noqa: BLE001 - lib raises bare Exception + if "Data channel is not open" not in str(error): + raise # unexpected -> let the loop's fail-safe handle it + last_error = error + if attempt + 1 < self.config.sport_retry_attempts: + time.sleep(self.config.sport_retry_delay_s) + logger.warning( + "greeter: sport %s (%d) NOT delivered after %d attempts (data " + "channel not open): %s", + name, api_id, self.config.sport_retry_attempts, last_error) + return False + + # -- Tracker / models / geometry -- + + def _init_tracker(self, bbox, image: Image) -> bool: + if not valid_bbox(list(bbox), image.width, image.height): + logger.warning("greeter: VLM bbox %s invalid for the frame", bbox) + return False + box = clamp_bbox(bbox, image.width, image.height) + tracker = self._get_tracker() + detections = tracker.init_track( + image=image, box=np.array(box, dtype=np.float32), obj_id=1) + return len(detections) > 0 + + def _track(self, image: Image): + tracker = self._get_tracker() + detections = tracker.process_image(image) + people = list(getattr(detections, "detections", [])) + if not people: + return None + best = max(people, key=lambda d: d.bbox_2d_volume()) + return best + + def _locate(self, bbox, image: Image): + """Subject world position + distance for a bbox. + + Prefers the lidar pointcloud (real returns off the subject's body -- + robust to the dog's low viewpoint), and falls back to the monocular + ground raycast when no lidar is wired or no points land in the box. + + Returns ``((x, y), distance_m)`` in the world frame, or None if neither + method can produce a fix. + """ + located = self._locate_lidar(bbox, image) + if located is not None: + return located + return self._locate_raycast(bbox, image) + + def _locate_lidar(self, bbox, image: Image): + """Range the subject from the lidar returns inside its bbox. + + Projects the lidar cloud into the camera, keeps the points that fall in + the tracked box, and takes their centroid via DimOS' + :meth:`Detection3DPC.from_2d`. Unlike the ground raycast it does not + assume the subject's feet rest on a flat floor -- the failure mode that + put a seated person metres from their true range. Returns + ``((x, y), distance_m)`` in the world frame, or None if unavailable. + + Logs (rate-limited) *why* a fix did or did not happen, so a run that + falls back to the raycast says which boundary failed: no cloud arriving, + an empty cloud, a missing TF, or no lidar returns inside the box. + """ + with self._lock: + pointcloud = self._latest_pointcloud + if self._camera_info is None: + return None + if pointcloud is None: + self._log_lidar("no cloud received on /lidar yet") + return None + point_count = pointcloud.as_numpy()[0].shape[0] + if point_count == 0: + self._log_lidar("/lidar cloud is empty") + return None + # Use the cloud's own frame (DimOS' detector does the same): the tf must + # map those points into the camera, whatever frame the driver tags them. + world_to_optical = self.tf.get( + self.config.camera_optical_frame_id, + pointcloud.frame_id, + time_point=image.ts, + time_tolerance=self.config.tf_tolerance_s, + ) + if world_to_optical is None: + self._log_lidar( + "no TF %s<-%s at the image timestamp" + % (self.config.camera_optical_frame_id, pointcloud.frame_id)) + return None + detection = Detection2DBBox( + bbox=tuple(bbox), track_id=-1, class_id=0, confidence=1.0, + name=self.config.description, ts=image.ts, image=image, + ) + # No filters: the Go2 lidar is sparse, so the default outlier/raycast + # filters can drop every return inside a person's box. The bbox alone + # isolates the subject well enough for a range estimate. + detection_3d = Detection3DPC.from_2d( + detection, + world_pointcloud=pointcloud, + camera_info=self._camera_info, + world_to_optical_transform=world_to_optical, + filters=[], + ) + if detection_3d is None: + self._log_lidar( + "from_2d found no lidar returns in the bbox (cloud=%d pts)" + % point_count) + return None + # Camera origin in the cloud's own frame, so it subtracts cleanly from + # the (same-frame) detection centre. + cam = self.tf.get( + pointcloud.frame_id, + self.config.camera_optical_frame_id, + time_point=image.ts, + time_tolerance=self.config.tf_tolerance_s, + ) + if cam is None: + return None + origin = cam.to_matrix()[:3, 3] + center = detection_3d.center + distance = math.hypot(center.x - origin[0], center.y - origin[1]) + self._log_lidar( + "fix from %d-pt cloud -> (%.2f, %.2f) d=%.2fm" + % (point_count, center.x, center.y, distance)) + return ((center.x, center.y), distance) + + def _log_lidar(self, message: str) -> None: + """Rate-limited lidar-path diagnostics (the why behind a fix/fallback).""" + now = time.monotonic() + if now - self._last_lidar_log_s >= self.config.lidar_log_interval_s: + self._last_lidar_log_s = now + logger.info("greeter lidar: %s", message) + + def _locate_raycast(self, bbox, image: Image): + """Floor position + distance of a bbox via the monocular ground raycast. + + Returns ``((x, y), distance_m)`` in the world frame, or None if + intrinsics or the TF are unavailable or the ray misses the floor. + """ + if self._intrinsics is None: + return None + transform = self.tf.get( + self.config.world_frame_id, + self.config.camera_optical_frame_id, + time_point=image.ts, + time_tolerance=self.config.tf_tolerance_s, + ) + if transform is None: + return None + matrix = transform.to_matrix() + point = pixel_to_ground_point( + ground_contact_pixel(bbox), + self._intrinsics, + matrix, + ground_z=self.config.world_floor_z_m, + ) + if point is None: + return None + origin = matrix[:3, 3] + distance = math.hypot(point[0] - origin[0], point[1] - origin[1]) + return ((point[0], point[1]), distance) + + def _get_vl_model(self) -> VlModel: + with self._lock: + if self._vl_model is None: + self._vl_model = QwenChinaVlModel() + return self._vl_model + + def _get_tracker(self) -> EdgeTAMProcessor: + with self._lock: + if self._tracker is None: + self._tracker = EdgeTAMProcessor() + return self._tracker + + # -- Misc -- + + def _on_color_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + def _on_pointcloud(self, pointcloud: PointCloud2) -> None: + with self._lock: + self._latest_pointcloud = pointcloud + + def _on_odom(self, odom: PoseStamped) -> None: + with self._lock: + self._latest_odom = odom + + def _on_costmap(self, costmap: OccupancyGrid) -> None: + with self._lock: + self._latest_costmap = costmap + + def _publish_world_pose( + self, image: Image, position: tuple[float, float] + ) -> None: + """Publish the subject's floor position (world frame) for Rerun 3D.""" + self.subject_world_pose.publish(PoseStamped( + ts=image.ts, + frame_id=self.config.world_frame_id, + position=[position[0], position[1], self.config.world_floor_z_m], + orientation=[0.0, 0.0, 0.0, 1.0], + )) + + def _publish_debug(self, image: Image, bbox, label: str) -> None: + frame = image.to_opencv().copy() + color = (0, 255, 0) if bbox is not None else (0, 0, 255) + if bbox is not None: + x1, y1, x2, y2 = (int(round(v)) for v in bbox) + cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2) + cv2.putText( + frame, label, (12, 28), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2) + self.debug_image.publish(Image.from_opencv(frame, ts=image.ts)) + + def _stop_loop(self) -> None: + self._should_stop.set() + with self._lock: + thread = self._thread + self._thread = None + if thread is not None: + thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + + def _halt(self) -> None: + """Stop the robot: zero velocity AND stop the patrol, independently. + + Used by the stop skill, module shutdown, and the loop's fail-safe path. + Each mechanism is guarded on its own so a failure in one still attempts + the other, and neither may raise -- either would leave the robot driving. + """ + try: + self.cmd_vel.publish(Twist.zero()) + except Exception: # noqa: BLE001 - last-resort halt must not raise + logger.exception("greeter: zero-velocity publish failed") + try: + patrol = getattr(self, "_patrolling_module_spec", None) + if patrol is not None and patrol.is_patrolling(): + patrol.stop_patrol() + except Exception: # noqa: BLE001 - last-resort halt must not raise + logger.exception("greeter: stop_patrol failed") + + def _reset_engagement(self) -> None: + """Clear transient per-run state for a clean wander start. + + Keeps the visited registry, so subjects greeted in an earlier run stay + skipped across restarts. + """ + self._machine.reset() + self._chosen = None + self._last_position = None + self._last_snapshot = None + self._last_trace = None + self._last_scan_s = 0.0 + self._rescanning = False + self._roaming = False + self._roam_anchor_xy = None + self._roam_progress_s = 0.0 + + def _set_error_status(self, message: str) -> None: + self._publish_status(GreeterSnapshot( + state="wander", message=message, reason="error")) + + def _publish_status( + self, snapshot: GreeterSnapshot, obs: GreeterObservation | None = None + ) -> None: + """Record + publish a rich phase trace on greeter_phase (phase + why). + + Beyond the phase itself, the trace carries the perception that drove this + tick (visible / distance), the patrol state, the subject's floor + position, and the greeted count -- so one stream explains the behavior. + """ + self._last_snapshot = snapshot + distance = ( + round(obs.distance_m, 2) + if obs is not None and obs.distance_m is not None else None + ) + position = ( + [round(self._last_position[0], 2), round(self._last_position[1], 2)] + if self._last_position is not None else None + ) + trace = { + "state": snapshot.state, + "message": snapshot.message, + "entered": snapshot.entered, + "reason": snapshot.reason, + "patrolling": self._is_patrolling(), + "subject_visible": bool(obs.subject_visible) if obs else False, + "distance_m": distance, + "subject_xy": position, + "greeted": self._registry.visited_count, + } + self._last_trace = json.dumps(trace, separators=(",", ":")) + self.greeter_phase.publish(self._last_trace) + # Persist the gate inputs at each phase change (greeter_phase itself is + # live LCM only and is not saved to the run log). This puts the + # distance / subject_xy behind every transition -- e.g. why a + # greet fired -- into main.jsonl for after-the-fact debugging. + if snapshot.entered: + logger.info("greeter phase trace: %s", self._last_trace) + + def _is_patrolling(self) -> bool: + patrol = getattr(self, "_patrolling_module_spec", None) + return patrol is not None and patrol.is_patrolling() diff --git a/hackathon/pawtrack/src/pawtrack/greeter_state.py b/hackathon/pawtrack/src/pawtrack/greeter_state.py new file mode 100644 index 0000000000..10faf8be0b --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/greeter_state.py @@ -0,0 +1,211 @@ +"""Pure state machine for the greeter loop (no DimOS). + +The greeter dog cycles through four phases: + +- ``wander``: the DimOS patrol roams the room while the container scans for a + person sitting on a chair. When it picks a random not-recently-greeted subject + (see :mod:`pawtrack.visited_registry`) the patrol stops and we engage. +- ``approach``: drive to a polite standoff while keeping the subject centered. +- ``greet``: hold still and wave hello once -- whichever way the person faces. +- ``cooldown``: brief pause; the container marks the subject visited and resumes + the patrol, then we return to ``wander``. + +The greeting is unconditional: any seated person reached at a safe standoff gets +a wave -- there is no facing / orbit-to-front requirement (an earlier design +circled until the subject faced the camera, which often never resolved). + +This module owns *which phase* the robot is in, given a per-tick observation; +the DimOS container performs the effects -- start/stop the patrol, run the +detector and tracker, drive ``cmd_vel``, fire the greeting, and mark the subject +visited -- reacting to the ``entered`` edges in the returned snapshot. + +Pure: stdlib only, unit tested. Mirrors the ``MonitorState`` pattern in +:mod:`pawtrack.track_state`. +""" + +from __future__ import annotations + +import dataclasses +import json +from typing import Literal + +State = Literal["wander", "approach", "greet", "cooldown"] + +_MESSAGES: dict[State, str] = { + "wander": "Patrolling, looking for a person sitting on a chair.", + "approach": "Approaching the subject.", + "greet": "Greeting the subject (waving hello).", + "cooldown": "Greeted; marking visited and resuming the patrol.", +} + + +@dataclasses.dataclass(frozen=True) +class GreeterParams: + """Thresholds for the greeter phase transitions. + + Attributes: + standoff_m: Polite distance to hold from the subject before greeting. + standoff_tolerance_m: Slack added to ``standoff_m`` when deciding the + robot is "close enough" to stop approaching and wave. + min_safe_distance_m: Hard floor on the distance at greeting time -- the + robot will not wave if it is closer than this (accounts for the dog's + footprint and the reach of the Hello animation). If it has drifted + inside this, the standoff hold backs it out before it can greet. + engage_timeout_s: Give up and resume patrol if approaching stalls this + long without greeting (e.g. an unreachable spot, or a subject that + left). The subject is then skipped so we do not immediately re-pick + and re-stick on it. This bounds the whole engagement, including the + dead-reckon-to-last-known phase after a track loss. + greet_duration_s: How long the greeting phase lasts. + cooldown_duration_s: How long to wait after greeting before resuming the + patrol, so the just-greeted subject is not immediately re-engaged. + """ + + standoff_m: float = 1.5 + standoff_tolerance_m: float = 0.3 + min_safe_distance_m: float = 1.0 + engage_timeout_s: float = 10.0 + greet_duration_s: float = 3.0 + cooldown_duration_s: float = 5.0 + + +@dataclasses.dataclass(frozen=True) +class GreeterObservation: + """One tick of perception for the greeter machine. + + Attributes: + target_acquired: A not-recently-greeted subject was selected this tick + (the container has chosen one and started tracking it). Triggers + ``wander -> approach``. + subject_visible: Whether the engaged subject is currently tracked (or was + re-detected on arrival). The greeting only fires while it is visible. + distance_m: Estimated floor distance to the subject, or None if unknown. + While the subject is tracked this is the camera-ranged distance; + while it is lost it is the odom distance to its last-known position. + reached_destination: While the subject is lost, whether the dog has + driven to its last-known position (dead reckoning) and is still + unable to see it -- the cue to give up that engagement. + """ + + target_acquired: bool = False + subject_visible: bool = False + distance_m: float | None = None + reached_destination: bool = False + + +@dataclasses.dataclass(frozen=True) +class GreeterSnapshot: + """The greeter phase after a tick, suitable for JSON status output. + + Attributes: + state: The current phase. + message: Human-readable description of the phase. + entered: True if this tick transitioned into ``state`` -- a one-shot + edge the container keys effects off: ``approach`` -> stop the patrol + and start tracking, ``greet`` -> fire the greeting once, ``cooldown`` + -> mark the subject visited, ``wander`` -> resume the patrol. + reason: Why this phase was entered, when it matters. ``"stuck"`` on a + ``wander`` edge means an engagement timed out (vs a merely lost + subject, ``None``), so the container can skip that subject. + """ + + state: State + message: str + entered: bool = False + reason: str | None = None + + def to_json(self) -> str: + """Serialize to compact JSON for a status stream.""" + return json.dumps(dataclasses.asdict(self), separators=(",", ":")) + + +class GreeterMachine: + """Decides the greeter phase across ticks from per-tick observations. + + Wander -> approach (a target is acquired) -> greet (visible at a safe + standoff) -> cooldown (timed) -> wander. Losing the tracker does not abandon + the approach: the container dead-reckons to the subject's last-known + position, and only once it has arrived there and still cannot see the subject + (``reached_destination`` with ``subject_visible`` false) does the machine + give up. ``engage_timeout_s`` bounds the whole engagement. Greet and cooldown + are purely timed and ignore the observation. + """ + + def __init__(self, params: GreeterParams | None = None): + self._params = params if params is not None else GreeterParams() + self._state: State = "wander" + self._state_since = 0.0 + self._pending_reason: str | None = None + + @property + def state(self) -> State: + """The current phase.""" + return self._state + + def reset(self) -> None: + """Return to a clean initial ``wander`` state, for a fresh greeter run.""" + self._state = "wander" + self._state_since = 0.0 + self._pending_reason = None + + def step(self, observation: GreeterObservation, now: float) -> GreeterSnapshot: + """Fold one tick in and return the resulting phase snapshot. + + Args: + observation: This tick's perception. + now: Current monotonic timestamp. + + Returns: + The phase snapshot; ``entered`` is True when the phase changed. + """ + self._pending_reason = None + nxt = self._next_state(observation, now) + entered = nxt != self._state + if entered: + self._state = nxt + self._state_since = now + reason = self._pending_reason if entered else None + return GreeterSnapshot( + self._state, _MESSAGES[self._state], entered, reason) + + def _next_state(self, obs: GreeterObservation, now: float) -> State: + if self._state == "wander": + return "approach" if obs.target_acquired else "wander" + if self._state == "approach": + if self._engage_timed_out(now): + return "wander" + if obs.subject_visible and self._safe_to_greet(obs): + return "greet" + if obs.reached_destination and not obs.subject_visible: + # Drove to the subject's last-known spot but it is not there + # (it moved or was a bad fix). Skip it like a stalled engagement. + self._pending_reason = "lost" + return "wander" + return "approach" + if self._state == "greet": + if now - self._state_since >= self._params.greet_duration_s: + return "cooldown" + return "greet" + # cooldown + if now - self._state_since >= self._params.cooldown_duration_s: + return "wander" + return "cooldown" + + def _within_standoff(self, obs: GreeterObservation) -> bool: + if obs.distance_m is None: + return False + limit = self._params.standoff_m + self._params.standoff_tolerance_m + return obs.distance_m <= limit + + def _safe_to_greet(self, obs: GreeterObservation) -> bool: + """At the standoff and no closer than the safe minimum (dog footprint).""" + if obs.distance_m is None: + return False + return obs.distance_m >= self._params.min_safe_distance_m and ( + self._within_standoff(obs)) + + def _engage_timed_out(self, now: float) -> bool: + if now - self._state_since > self._params.engage_timeout_s: + self._pending_reason = "stuck" + return True + return False diff --git a/hackathon/pawtrack/src/pawtrack/ground_raycast.py b/hackathon/pawtrack/src/pawtrack/ground_raycast.py new file mode 100644 index 0000000000..a12d538e94 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/ground_raycast.py @@ -0,0 +1,122 @@ +"""Pure pixel-to-ground raycast for absolute subject positioning (no DimOS). + +Given the tracked subject's pixel in the camera image, the camera intrinsics, +and the camera's pose in a target frame, intersect the viewing ray with a +horizontal ground plane to recover the subject's position in that frame. + +This is approach "B": it needs no depth sensor and no lidar return on the +subject -- only the camera pose and a known ground height -- so it stays robust +for a subject on a flat floor, where a sparse lidar might miss it (and so +``Detection3DPC.from_2d`` would return nothing). The result is *absolute* only +insofar as the supplied camera-to-target transform is into an absolute frame +(the ``map`` frame from relocalization); given a ``world``/odom transform the +result lands in that (drifting) frame instead. + +Pure: numpy only, unit tested. The DimOS glue supplies the intrinsics (from +``CameraInfo.K``) and the 4x4 transform (``tf.get(target, camera_optical) +.to_matrix()``), then wraps the returned point in a ``PoseStamped``. +""" + +from __future__ import annotations + +import dataclasses + +import numpy as np + +# Rays whose ground-plane component is below this (in the target frame) are +# treated as parallel to the floor; the intersection would be at infinity. +_MIN_VERTICAL_COMPONENT = 1e-9 + + +@dataclasses.dataclass(frozen=True) +class CameraIntrinsics: + """Pinhole camera intrinsics, in pixels. + + Attributes: + fx: Focal length in x. + fy: Focal length in y. + cx: Principal point x. + cy: Principal point y. + """ + + fx: float + fy: float + cx: float + cy: float + + def __post_init__(self): + if self.fx <= 0.0 or self.fy <= 0.0: + raise ValueError("focal lengths fx, fy must be positive") + + @classmethod + def from_k(cls, k: list[float]) -> CameraIntrinsics: + """Build from a row-major 3x3 intrinsic matrix (e.g. ``CameraInfo.K``). + + Args: + k: Nine elements ``[fx, 0, cx, 0, fy, cy, 0, 0, 1]`` (row-major). + + Returns: + The intrinsics read from ``k``. + + Raises: + ValueError: If ``k`` does not have nine elements. + """ + if len(k) != 9: + raise ValueError("K must have 9 elements (row-major 3x3)") + return cls(fx=float(k[0]), fy=float(k[4]), cx=float(k[2]), cy=float(k[5])) + + +def pixel_to_ground_point( + pixel: tuple[float, float], + intrinsics: CameraIntrinsics, + camera_to_target: np.ndarray, + ground_z: float = 0.0, +) -> tuple[float, float, float] | None: + """Intersect a pixel's viewing ray with a horizontal ground plane. + + The pixel is back-projected to a ray in the camera optical frame (``+x`` + right, ``+y`` down, ``+z`` forward), transformed into the target frame by + ``camera_to_target``, and intersected with the plane ``z == ground_z``. + + Args: + pixel: ``(u, v)`` pixel coordinates of the point (e.g. the subject's + bbox bottom-center, where it meets the floor). + intrinsics: Pinhole camera intrinsics. + camera_to_target: 4x4 homogeneous transform mapping a camera-optical + point to the target frame -- i.e. ``tf.get(target, camera_optical) + .to_matrix()``. + ground_z: Height of the ground plane in the target frame. Pass ``0`` + (the floor) for a pixel that sits on the floor; pass an offset only + if the chosen pixel is above the floor. + + Returns: + ``(x, y, z)`` of the ray/plane intersection in the target frame, or + ``None`` if the ray is parallel to the plane or points away from it + (the plane lies behind the camera along the ray). + + Raises: + ValueError: If ``camera_to_target`` is not a 4x4 matrix. + """ + matrix = np.asarray(camera_to_target, dtype=float) + if matrix.shape != (4, 4): + raise ValueError("camera_to_target must be a 4x4 matrix") + + u, v = pixel + ray_camera = np.array( + [ + (u - intrinsics.cx) / intrinsics.fx, + (v - intrinsics.cy) / intrinsics.fy, + 1.0, + ] + ) + origin = matrix[:3, 3] + direction = matrix[:3, :3] @ ray_camera + + vertical = direction[2] + if abs(vertical) < _MIN_VERTICAL_COMPONENT: + return None # ray parallel to the ground plane + distance = (ground_z - origin[2]) / vertical + if distance <= 0.0: + return None # intersection is behind the camera along the ray + point = origin + distance * direction + return (float(point[0]), float(point[1]), float(point[2])) diff --git a/hackathon/pawtrack/src/pawtrack/identify.py b/hackathon/pawtrack/src/pawtrack/identify.py new file mode 100644 index 0000000000..85e532ab29 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/identify.py @@ -0,0 +1,161 @@ +"""Multi-target identification: prompt a VL model for every match + parse it. + +The single-box ``get_object_bbox_from_image`` grounding returns one box; the +greeter needs *all* matching subjects so it can pick one at random. This asks the +same VL model (via its duck-typed ``query(image, prompt)``) for a JSON list of +boxes and parses it leniently. + +No DimOS import -- it works on any object with a ``query`` method and the parser +is pure, so both are unit tested without a model. The DimOS container injects the +real ``QwenChinaVlModel``. +""" + +from __future__ import annotations + +import json +import re + +BBox = tuple[float, float, float, float] + +_PROMPT = ( + "Look at this image and find EVERY {description}. Return ONLY a JSON array; " + "each element is an object {{\"bbox\": [x1, y1, x2, y2]}} where x1,y1 is the " + "top-left and x2,y2 the bottom-right corner in pixels. Return [] if there are " + "none." +) + + +def build_prompt(description: str) -> str: + """The multi-box detection prompt for a subject description.""" + return _PROMPT.format(description=description) + + +def _coerce_bbox(item: object) -> BBox | None: + """Coerce one parsed element into a 4-float bbox, or None if it is not one.""" + if isinstance(item, dict): + item = item.get("bbox") or item.get("box") or item.get("bbox_2d") + if not isinstance(item, (list, tuple)) or len(item) != 4: + return None + try: + x1, y1, x2, y2 = (float(v) for v in item) + except (TypeError, ValueError): + return None + return (x1, y1, x2, y2) + + +def parse_bboxes(response_text: str) -> list[BBox]: + """Parse a VL model's response into a list of bboxes (lenient). + + Accepts a bare JSON array (of ``{"bbox": [...]}`` objects or of raw + ``[x1, y1, x2, y2]`` lists), or an object wrapping such an array, with or + without ``` fences. Anything unparseable yields an empty list. + + Args: + response_text: Raw model output. + + Returns: + The valid 4-number boxes found, in order; empty if none. + """ + if not response_text: + return [] + text = response_text.strip() + # Strip an optional ```json ... ``` code fence. + fence = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL) + if fence: + text = fence.group(1).strip() + # Fall back to the outermost JSON array/object if there is surrounding prose. + if not text or text[0] not in "[{": + match = re.search(r"[\[{].*[\]}]", text, re.DOTALL) + if not match: + return [] + text = match.group(0) + try: + parsed = json.loads(text) + except json.JSONDecodeError: + return [] + if isinstance(parsed, dict): + if _coerce_bbox(parsed) is not None: + # A single detection object, e.g. {"bbox": [x1, y1, x2, y2]} -- the + # dict itself is one box (some models answer with one despite the + # multi-box prompt). + items: list[object] = [parsed] + else: + # A wrapper, e.g. {"objects": [...]} / {"boxes": [[...], ...]}: take + # its first list of detections. + items = next( + (v for v in parsed.values() if isinstance(v, list)), [] + ) + elif isinstance(parsed, list): + items = parsed + else: + return [] + boxes = [_coerce_bbox(item) for item in items] + return [box for box in boxes if box is not None] + + +def detect_all(vl_model: object, image: object, description: str) -> list[BBox]: + """Query ``vl_model`` for every ``description`` in ``image`` and parse boxes. + + Args: + vl_model: Anything with ``query(image, prompt) -> str`` (e.g. a DimOS + ``VlModel``). + image: The frame to pass through to the model. + description: Visual description of the subject (e.g. "a person sitting + on a chair"). + + Returns: + Every parsed bounding box; empty if none were found or returned. + """ + response = vl_model.query(image, build_prompt(description)) + return parse_bboxes(response) + + +def build_facing_prompt(description: str) -> str: + """Prompt asking whether the target object faces the camera (front/back). + + Parameterised by the subject ``description`` so the same check works for + whatever the greeter targets: a seated person on the robot, or a chair in + simulation. A chair shows its backrest / is seen from behind when "facing + away", so this lets the front/back logic (and the FingerHeart reward) be + exercised in sim by pointing at chairs in different orientations. + """ + return ( + "In this image, is {description} facing TOWARD the camera, or facing " + "AWAY from it? A back / backrest / rear view, or seeing it from behind, " + 'counts as facing away. Answer with exactly one word: "front" if it ' + 'faces the camera, "back" if it faces away.' + ).format(description=description) + + +def parse_facing(response_text: str) -> bool | None: + """Parse a front/back facing answer leniently. + + Returns True for a clear "front" (facing the camera), False for a clear + "back" (facing away), and None when the answer mentions both or neither + (orientation unknown) -- the caller picks a safe default. + """ + if not response_text: + return None + low = response_text.strip().lower() + front = "front" in low or "toward" in low or "towards" in low + back = "back" in low or "away" in low or "behind" in low + if front == back: # both cues, or neither -> undecided + return None + return front + + +def detect_facing( + vl_model: object, image: object, description: str +) -> bool | None: + """Ask ``vl_model`` whether the targeted subject faces the camera. + + Args: + vl_model: Anything with ``query(image, prompt) -> str``. + image: The frame to pass through to the model. + description: The subject being greeted (e.g. "a person sitting on a + chair", or "a chair" in sim), so the prompt asks about that object. + + Returns: + True if facing the camera, False if facing away, None if undecided. + """ + return parse_facing(vl_model.query(image, build_facing_prompt(description))) diff --git a/hackathon/pawtrack/src/pawtrack/image_source.py b/hackathon/pawtrack/src/pawtrack/image_source.py new file mode 100644 index 0000000000..466ba9ba66 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/image_source.py @@ -0,0 +1,99 @@ +"""DimOS glue: publish frames from an image or video file as ``color_image``. + +A hardware-free camera source for testing the subject tracker end to end with +no robot and no webcam: point it at a photo or a short clip that contains the +subject, then describe it from the DimOS CLI. Loops the file so the stream is +continuous. Imports DimOS, so it is not unit tested. +""" + +from __future__ import annotations + +import threading +import time +from typing import Any + +import cv2 + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import Out +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class Config(ModuleConfig): + """Configuration for the file image source.""" + + path: str = "" # image or video file to publish + fps: float = 15.0 + loop: bool = True # restart the video (or keep republishing the image) + + +class FileImageSource(Module): + """Publish frames from an image/video file on ``color_image``.""" + + config: Config + + color_image: Out[Image] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._stop = threading.Event() + self._thread: threading.Thread | None = None + + @rpc + def start(self) -> None: + super().start() + self._stop.clear() + self._thread = threading.Thread( + target=self._publish_frames, name="file-image-source", daemon=True + ) + self._thread.start() + + @rpc + def stop(self) -> None: + self._stop.set() + thread = self._thread + if thread is not None: + thread.join(timeout=2.0) + self._thread = None + super().stop() + + def _publish(self, frame: Any) -> None: + self.color_image.publish(Image.from_opencv(frame, ts=time.time())) + + def _wait(self) -> None: + if self.config.fps > 0: + self._stop.wait(1.0 / self.config.fps) + + def _publish_frames(self) -> None: + still = cv2.imread(self.config.path) + if still is not None: + while not self._stop.is_set(): + self._publish(still) + self._wait() + return + capture = cv2.VideoCapture(self.config.path) + try: + if not capture.isOpened(): + logger.error( + "FileImageSource: cannot open %r", self.config.path) + return + produced = False + while not self._stop.is_set(): + ok, frame = capture.read() + if not ok: + # Loop a real video back to the start; but if not a single + # frame ever decoded, the file is unusable -- stop instead of + # busy-spinning forever on the reset. + if self.config.loop and produced: + capture.set(cv2.CAP_PROP_POS_FRAMES, 0) + continue + break + produced = True + self._publish(frame) + self._wait() + finally: + capture.release() diff --git a/hackathon/pawtrack/src/pawtrack/motion_fallback.py b/hackathon/pawtrack/src/pawtrack/motion_fallback.py new file mode 100644 index 0000000000..b481556211 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/motion_fallback.py @@ -0,0 +1,66 @@ +"""Pure frame-difference fallback for subject-motion detection. + +When the primary tracker (EdgeTAM) drops the subject -- common when a fast +mover blurs or briefly leaves the mask -- this finds the largest *moving* region +between two consecutive frames, so the monitor can cheaply re-seed the tracker +on it instead of escalating straight to the (slow, networked) VLM reacquire. + +Pure: numpy + OpenCV only, no DimOS, no models -- unit tested with synthetic +frames. Operates on BGR uint8 arrays (``Image.to_opencv()``). + +Assumes a roughly static camera: it returns the largest moving blob over the +whole frame, so under camera ego-motion (e.g. a wandering robot) the background +can dominate. Gate it on robot motion state -- or disable it via the monitor's +``motion_fallback`` config -- when the camera itself is moving. +""" + +from __future__ import annotations + +import cv2 +import numpy as np + +Bbox = tuple[float, float, float, float] + + +def detect_motion_bbox( + prev_bgr: np.ndarray, + curr_bgr: np.ndarray, + diff_threshold: int = 25, + min_area_fraction: float = 0.0008, +) -> Bbox | None: + """Bounding box of the largest moving region between two frames. + + Args: + prev_bgr: Previous frame, BGR uint8 ``(H, W, 3)``. + curr_bgr: Current frame, BGR uint8 ``(H, W, 3)``; must match ``prev``. + diff_threshold: Per-pixel grayscale difference counted as motion. + min_area_fraction: Reject moving blobs smaller than this fraction of + the image (suppresses sensor noise). + + Returns: + ``(x1, y1, x2, y2)`` of the largest moving blob, or None if nothing + moved enough or the frames are unusable. + """ + if prev_bgr.shape != curr_bgr.shape or prev_bgr.ndim != 3: + return None + height, width = curr_bgr.shape[:2] + prev_gray = cv2.cvtColor(prev_bgr, cv2.COLOR_BGR2GRAY) + curr_gray = cv2.cvtColor(curr_bgr, cv2.COLOR_BGR2GRAY) + diff = cv2.absdiff(prev_gray, curr_gray) + _, mask = cv2.threshold(diff, diff_threshold, 255, cv2.THRESH_BINARY) + # Dilate so the disjoint halves of one moving subject merge into one blob. + mask = cv2.dilate(mask, np.ones((5, 5), np.uint8), iterations=2) + + count, _labels, stats, _ = cv2.connectedComponentsWithStats(mask) + if count <= 1: # only the background component + return None + # Largest non-background component (index 0 is the background). + areas = stats[1:, cv2.CC_STAT_AREA] + best = 1 + int(np.argmax(areas)) + if stats[best, cv2.CC_STAT_AREA] < min_area_fraction * width * height: + return None + x = stats[best, cv2.CC_STAT_LEFT] + y = stats[best, cv2.CC_STAT_TOP] + w = stats[best, cv2.CC_STAT_WIDTH] + h = stats[best, cv2.CC_STAT_HEIGHT] + return (float(x), float(y), float(x + w), float(y + h)) diff --git a/hackathon/pawtrack/src/pawtrack/occupancy.py b/hackathon/pawtrack/src/pawtrack/occupancy.py new file mode 100644 index 0000000000..f08e0518f9 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/occupancy.py @@ -0,0 +1,77 @@ +"""Reject subject candidates that land on a mapped wall (glass reflections). + +A person reflected in a glass wall is detected by the VLM, and the lidar ranges +the glass *surface* the reflection sits on, so the two agree at the standoff and +the greeter walks over and waves at the reflection. The navigation costmap +already marks that glass wall as an obstacle, so a candidate whose floor +position lands on -- or within a small clearance of -- an occupied cell is +almost certainly a reflection or a badly ranged box; a real person to greet +stands in free space. + +This module is pure (no DimOS import) so it can be unit tested with a plain +list-of-lists grid. The container adapts the DimOS ``OccupancyGrid`` into these +primitive arguments. +""" + +from __future__ import annotations + +import math +from collections.abc import Sequence + +# nav_msgs OccupancyGrid cost convention: -1 unknown, 0 free, up to 100 lethal. +# A cell at or above this cost is treated as a wall for rejection. Tunable: raise +# toward 100 to reject only near-lethal cells (keeps subjects close to walls). +DEFAULT_COST_THRESHOLD = 50 + + +def is_near_obstacle( + grid: Sequence[Sequence[int]] | None, + resolution: float, + origin_xy: tuple[float, float], + point_xy: tuple[float, float], + *, + clearance_m: float, + cost_threshold: int = DEFAULT_COST_THRESHOLD, +) -> bool: + """Whether world ``point_xy`` is on, or within ``clearance_m`` of, a wall. + + Fails open: returns False (keep the candidate) when there is no map yet, the + resolution is invalid, the point is off the grid, or only unknown/free cells + are nearby. The filter rejects only on a *known* obstacle, so a missing or + partial map never suppresses greeting. + + Args: + grid: Occupancy costs indexed ``grid[row_y][col_x]`` (the nav_msgs + convention), or None when no costmap has arrived. + resolution: Metres per cell. + origin_xy: World coordinate of grid cell (0, 0). + point_xy: World (x, y) to test. + clearance_m: Reject if an occupied cell lies within this radius. + cost_threshold: Minimum cell cost treated as an obstacle. + + Returns: + True if a cell with cost >= ``cost_threshold`` lies within + ``clearance_m`` of the point; False otherwise. + """ + if grid is None or resolution <= 0.0: + return False + height = len(grid) + if height == 0: + return False + width = len(grid[0]) + if width == 0: + return False + origin_x, origin_y = origin_xy + center_x = int((point_xy[0] - origin_x) / resolution) + center_y = int((point_xy[1] - origin_y) / resolution) + radius_cells = max(0, math.ceil(clearance_m / resolution)) + for delta_y in range(-radius_cells, radius_cells + 1): + for delta_x in range(-radius_cells, radius_cells + 1): + if math.hypot(delta_x, delta_y) * resolution > clearance_m: + continue + cell_x = center_x + delta_x + cell_y = center_y + delta_y + if 0 <= cell_x < width and 0 <= cell_y < height: + if grid[cell_y][cell_x] >= cost_threshold: + return True + return False diff --git a/hackathon/pawtrack/src/pawtrack/qwen_china.py b/hackathon/pawtrack/src/pawtrack/qwen_china.py new file mode 100644 index 0000000000..bc649f8063 --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/qwen_china.py @@ -0,0 +1,42 @@ +"""DimOS glue: Qwen VL via the Alibaba *China* DashScope endpoint. + +DimOS's ``QwenVlModel`` hardcodes the international endpoint +(``dashscope-intl.aliyuncs.com``), which returns 401 for a mainland-China Model +Studio key. This subclass points at the China endpoint and defaults to a +current grounding model. Override the model with ``PAWTRACK_VLM_MODEL``. + +``qwen-vl-max`` is the default because it returns accurate pixel-space bounding +boxes; the ``qwen3-vl-*`` models return 0-1000 normalized coordinates that the +DimOS bbox parser does not rescale. Imports DimOS; not unit tested. +""" + +from __future__ import annotations + +import functools +import os + +from openai import OpenAI + +from dimos.models.vl.qwen import QwenVlModel, QwenVlModelConfig + +_CHINA_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1" +_DEFAULT_MODEL = "qwen-vl-max" + + +class QwenChinaVlModelConfig(QwenVlModelConfig): + """Qwen VL config defaulting to a current China-endpoint model.""" + + model_name: str = os.getenv("PAWTRACK_VLM_MODEL", _DEFAULT_MODEL) + + +class QwenChinaVlModel(QwenVlModel): + """QwenVlModel that talks to the Alibaba China DashScope endpoint.""" + + config: QwenChinaVlModelConfig + + @functools.cached_property + def _client(self) -> OpenAI: + api_key = self.config.api_key or os.getenv("ALIBABA_API_KEY") + if not api_key: + raise ValueError("Alibaba API key must be set in ALIBABA_API_KEY.") + return OpenAI(base_url=_CHINA_BASE_URL, api_key=api_key) diff --git a/hackathon/pawtrack/src/pawtrack/skill_container.py b/hackathon/pawtrack/src/pawtrack/skill_container.py new file mode 100644 index 0000000000..f17934fc5e --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/skill_container.py @@ -0,0 +1,667 @@ +"""DimOS skill container for PawTrack: find and track a described subject. + +One container exposes the perception stage as DimOS skills: the user (or the +agent) describes a subject -- a person sitting on a chair, a person in a red +shirt, any object -- a VLM localizes it, EdgeTAM tracks the selected subject +frame-to-frame, and JSON status (bbox, centering, size) is published. Decision +logic lives in the pure ``MonitorState``; this module runs the effects +(tracker/VLM, publish snapshots, debug frames). The monitor loop fails loudly -- +any exception is logged and surfaced as an ``error`` status rather than letting +the worker thread die silently. When EdgeTAM drops a fast-moving subject, an +optional frame-motion fallback re-seeds the tracker before retrying. + +Alongside the status stream, the tracked subject's ground position is published +for an upstream planner: the subject's ground-contact pixel (bottom-center of +the bbox) is back-projected onto the floor plane (approach "B") in the live +``world`` frame, and -- when relocalization is running -- in the prebuilt +``map`` frame too. + +This container is perception only: it never drives the robot. Motion (wander, +approach, greet) belongs to a separate container. +""" + +from __future__ import annotations + +import base64 +import threading +import time +from typing import Any + +import cv2 +import numpy as np +from reactivex.disposable import Disposable +from turbojpeg import TurboJPEG + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.models.qwen.bbox import BBox +from dimos.models.segmentation.edge_tam import EdgeTAMProcessor +from dimos.models.vl.base import VlModel +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat +from dimos.navigation.visual.query import get_object_bbox_from_image +from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox +from dimos.robot.unitree.dimsim_connection import DimSimConnection +from dimos.robot.unitree.go2.connection import GO2Connection +from dimos.robot.unitree.mujoco_connection import MujocoConnection +from dimos.utils.logging_config import setup_logger + +from pawtrack.ground_raycast import CameraIntrinsics, pixel_to_ground_point +from pawtrack.motion_fallback import detect_motion_bbox +from pawtrack.qwen_china import QwenChinaVlModel +from pawtrack.track_state import ( + MonitorParams, + MonitorState, + TrackSnapshot, + VisualObservation, + bbox_in_image, + clamp_bbox, + ground_contact_pixel, + is_bbox_shape, + valid_bbox, + visual_metrics, +) + +logger = setup_logger() + + +class Config(ModuleConfig): + """Configuration for the PawTrack subject-tracking skills.""" + + # Subject tracking. + monitor_loop_hz: float = 15.0 + max_lost_frames: int = 15 # frames without a mask before reacquiring + reacquire_interval_frames: int = 15 # VLM retry cadence while lost + max_reacquire_attempts: int = 5 # give up after this many failed retries + # Re-seed the tracker from frame motion on a miss. OFF by default: it + # re-seeds on the largest MOVING blob, which is wrong whenever the camera + # itself moves (a wandering/approaching dog makes the whole frame move, so + # it would lock onto the background). Reacquire via the VLM instead. + motion_fallback: bool = False + max_center_jump_frac: float = 2.5 # reject jumps beyond this x subject width + max_area_factor: float = 4.0 # reject area changes beyond this factor + stale_timeout_s: float = 2.0 # no fix longer than this -> "stale" + # Subject position (ground-plane raycast, approach "B"). The tracked + # subject's ground-contact pixel -- the bottom-center of the bbox, where it + # meets the floor -- is back-projected to the floor in the live world frame + # for the planner. If relocalization is live, the same point is also + # published in the stable map frame. + camera_info: CameraInfo | None = None # intrinsics; auto-resolved if None + # Floor height (z) in each frame. The raycast plane is exactly this height: + # the bottom of the bbox (feet / seat / chair base) rests on the floor, so + # no object-size offset is added. + world_floor_z_m: float = 0.0 # floor z in the live odometry frame + map_floor_z_m: float = 0.0 # floor z in the prebuilt-map frame + world_frame_id: str = "world" # always-on live odometry frame + map_frame_id: str = "map" # optional prebuilt-map frame + camera_optical_frame_id: str = "camera_optical" + tf_tolerance_s: float = 1.0 # max age for the live world/odom TF lookup + # The map<-world TF is republished only every ~2 s by relocalization, so the + # map lookup needs a looser tolerance than the live odom chain or it would + # pause for the back half of each interval even when relocalization is + # healthy. Must exceed the relocalization publish interval (2.0 s). + map_tf_tolerance_s: float = 3.0 + # While a pose frame's TF is absent, retry its lookup at most this often + # rather than every tracked frame -- ``tf.get`` warns on each miss, which + # would otherwise flood logs at the monitor loop rate. + pose_probe_interval_s: float = 2.0 + + +class PawTrackSkillContainer(Module): + """Find and track a user-described subject, publishing status and position.""" + + config: Config + + color_image: In[Image] + subject_status: Out[str] # JSON status (LCM diagnostic stream) + debug_image: Out[Image] # annotated camera frame (shows in Rerun) + subject_world_pose: Out[PoseStamped] # position in live odometry frame + subject_map_pose: Out[PoseStamped] # optional position in premap frame + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._lock = threading.RLock() + self._monitor_stop = threading.Event() + self._thread: threading.Thread | None = None + self._latest_image: Image | None = None + self._prev_image: Image | None = None + self._state = MonitorState(MonitorParams( + max_lost_frames=self.config.max_lost_frames, + reacquire_interval_frames=self.config.reacquire_interval_frames, + max_reacquire_attempts=self.config.max_reacquire_attempts, + max_center_jump_frac=self.config.max_center_jump_frac, + max_area_factor=self.config.max_area_factor, + stale_timeout_s=self.config.stale_timeout_s, + )) + self._snapshot = self._state.snapshot + self._vl_model: VlModel | None = None + self._tracker: EdgeTAMProcessor | None = None + self._intrinsics = self._resolve_intrinsics() + self._warned_no_world_tf = False + self._warned_no_map_tf = False + # Monotonic time before which a pose frame's TF lookup is skipped, per + # frame_id, to throttle retries while that frame is absent. + self._next_pose_probe_s: dict[str, float] = {} + + def _resolve_intrinsics(self) -> CameraIntrinsics | None: + """Pinhole intrinsics for the ground-plane raycast. + + Prefer an explicit ``config.camera_info``; otherwise use the static + intrinsics of whichever connection backs this run (sim or the real + Go2), so the raycast works without extra blueprint wiring. Returns + ``None`` if no usable intrinsics are found (pose publishing then + no-ops). + """ + camera_info = self.config.camera_info + if camera_info is None: + simulation = self.config.g.simulation + if simulation == "mujoco": + camera_info = MujocoConnection.camera_info_static + elif simulation == "dimsim": + camera_info = DimSimConnection.camera_info_static + else: + camera_info = GO2Connection.camera_info_static + if ( + camera_info is None + or len(camera_info.K) != 9 + or camera_info.K[0] <= 0.0 + ): + return None + return CameraIntrinsics.from_k(list(camera_info.K)) + + @rpc + def start(self) -> None: + super().start() + self.register_disposable( + Disposable(self.color_image.subscribe(self._on_color_image)) + ) + + @rpc + def stop(self) -> None: + self._stop_monitoring() + with self._lock: + tracker = self._tracker + self._tracker = None + vl_model = self._vl_model + self._vl_model = None + if tracker is not None: + tracker.stop() + if vl_model is not None: + vl_model.stop() + super().stop() + + # -- Perception skills -- + + @skill + def track_subject( + self, + description: str = "a person sitting on a chair", + initial_bbox: list[float] | None = None, + initial_image: str | None = None, + ) -> str: + """Start tracking the subject matching a visual description. + + Use this to find, track, watch, or monitor anything the user describes + -- a person sitting on a chair, a person in a red shirt, a bottle, a + backpack -- so never refuse based on what the subject is. The + description should identify it visually, for example "a person sitting + on a chair" or "the person in the blue jacket". With no description, it + looks for a person sitting on a chair (the greeter's default target). + + Args: + description: Visual description of the subject to track. Defaults to + "a person sitting on a chair". + initial_bbox: Optional bbox ``[x1, y1, x2, y2]`` to skip VLM + acquisition when another tool or UI already selected the subject. + initial_image: Optional base64 JPEG matching ``initial_bbox``. + + Returns: + Status text describing whether tracking started. + """ + if initial_bbox is not None and not is_bbox_shape(initial_bbox): + return "initial_bbox must be four numbers [x1, y1, x2, y2]." + + self._stop_monitoring() + + with self._lock: + latest_image = self._latest_image + if latest_image is None: + return "No image available to identify the subject." + + # Reset the state machine for the new target before EITHER acquisition + # path, so a previous track's last fix cannot carry forward under the + # new description (via the drift gate or a carried-forward bbox). + self._set_snapshot(self._state.begin(description)) + + detection_image = None + if initial_bbox is not None: + bbox: BBox = ( + initial_bbox[0], + initial_bbox[1], + initial_bbox[2], + initial_bbox[3], + ) + if initial_image is not None: + detection_image = _decode_base64_image(initial_image) + else: + detected = get_object_bbox_from_image( + self._get_vl_model(), + latest_image, + description, + ) + if detected is None: + self._set_snapshot(TrackSnapshot( + status="lost", + description=description, + message=f"Could not find {description} in view.", + )) + return f"Could not find '{description}' in the current view." + bbox = detected + + return self._start_tracking(description, bbox, detection_image) + + @skill + def stop_tracking(self) -> str: + """Stop tracking the current subject.""" + self._stop_monitoring() + self._set_snapshot(self._state.stopped()) + return "Stopped tracking." + + @skill + def tracking_status(self) -> str: + """Return the latest tracked-subject bbox, size, and centering metrics.""" + with self._lock: + snapshot = self._snapshot + return snapshot.to_json() + + # -- Internals: perception -- + + def _on_color_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + def _get_vl_model(self) -> VlModel: + with self._lock: + if self._vl_model is None: + self._vl_model = QwenChinaVlModel() + return self._vl_model + + def _get_tracker(self) -> EdgeTAMProcessor: + with self._lock: + if self._tracker is None: + self._tracker = EdgeTAMProcessor() + return self._tracker + + def _start_tracking( + self, + description: str, + bbox: BBox, + detection_image: Image | None, + ) -> str: + with self._lock: + latest_image = self._latest_image + if latest_image is None: + return "No image available to start tracking." + init_image = ( + detection_image if detection_image is not None else latest_image + ) + if not valid_bbox(bbox, init_image.width, init_image.height): + logger.error( + "track_subject: invalid bbox %s for a %dx%d image", + bbox, init_image.width, init_image.height, + ) + self._set_snapshot(self._state.errored( + description, f"Invalid bounding box for {description}." + )) + return ( + f"Invalid bounding box for '{description}'; cannot start " + "tracking." + ) + bbox = clamp_bbox(bbox, init_image.width, init_image.height) + + tracker = self._get_tracker() + initial_detections = tracker.init_track( + image=init_image, box=np.array(bbox, dtype=np.float32), obj_id=1 + ) + if len(initial_detections) == 0: + self._set_snapshot(TrackSnapshot( + status="lost", + description=description, + message=f"Tracker could not initialize on {description}.", + )) + return f"Could not initialize tracking for '{description}'." + + self.start_tool("track_subject") + with self._lock: + self._prev_image = None + self._monitor_stop.clear() + self._thread = threading.Thread( + target=self._monitor_loop, + args=(tracker, description), + name="subject-monitor", + daemon=True, + ) + self._thread.start() + + return ( + f"Started tracking '{description}'. Call tracking_status for" + " the latest size and position metrics." + ) + + def _monitor_loop( + self, + tracker: EdgeTAMProcessor, + description: str, + ) -> None: + period = 1.0 / self.config.monitor_loop_hz + next_time = time.monotonic() + + while not self._monitor_stop.is_set(): + next_time += period + try: + with self._lock: + image = self._latest_image + detection = self._track_frame(tracker, image) + metrics = ( + self._metrics(image, detection) + if detection is not None else None + ) + snapshot, action = self._state.observe( + description, metrics, time.monotonic()) + self._set_snapshot(snapshot) + if action == "give_up": + break + if action == "reacquire" and image is not None: + if self._reacquire(tracker, description, image): + self._state.reacquired() + if image is not None: + drawn = detection if snapshot.status == "tracking" else None + self._publish_debug(description, image, drawn) + if drawn is not None and snapshot.bbox is not None: + self._publish_ground_poses(image, snapshot.bbox) + except Exception: # noqa: BLE001 - thread-boundary guard + # Deliberately broad: this is the worker-thread top level. Fail + # LOUDLY (full traceback + an `error` status) instead of letting + # the thread die silently and freeze the status. + logger.exception( + "subject monitor loop crashed for %r", description) + self._set_snapshot(self._state.errored( + description, "Monitor loop error; see logs.")) + break + + now = time.monotonic() + sleep_duration = next_time - now + if sleep_duration > 0: + self._monitor_stop.wait(sleep_duration) + else: + # Fell behind (e.g. a slow VLM reacquire); drop the banked lag + # instead of spinning flat-out forever. + next_time = now + + self.stop_tool("track_subject") + + def _track_frame( + self, tracker: EdgeTAMProcessor, image: Image | None + ) -> Detection2DBBox | None: + """Track one frame, with a frame-motion re-seed fallback on a miss.""" + if image is None: + return None + detection = self._tracked_detection(tracker, image) + prev_image = self._prev_image + self._prev_image = image + if detection is not None: + return detection + # EdgeTAM lost it: try to recover from frame motion (cheap, no VLM). + # NOTE: this re-seeds on the largest moving blob over the whole frame, + # which assumes a roughly static camera. Under robot ego-motion the + # background can dominate and re-seed onto the wrong thing -- disable + # via the `motion_fallback` config (off by default), or gate it on + # robot motion state once a motion layer drives the camera. + if not self.config.motion_fallback or prev_image is None: + return None + bbox = detect_motion_bbox(prev_image.to_opencv(), image.to_opencv()) + if bbox is None or not bbox_in_image(bbox, image.width, image.height): + return None + tracker.init_track( + image=image, box=np.array(bbox, dtype=np.float32), obj_id=1) + recovered = self._tracked_detection(tracker, image) + if recovered is not None: + # Motion recovery is a reacquire: re-baseline so the drift gate + # does not reject the (possibly far) recovered position. + self._state.reacquired() + return recovered + + def _tracked_detection( + self, tracker: EdgeTAMProcessor, image: Image + ) -> Detection2DBBox | None: + detections = tracker.process_image(image) + if len(detections) == 0: + return None + best = max(detections.detections, key=lambda d: d.bbox_2d_volume()) + if not bbox_in_image(best.bbox, image.width, image.height): + return None + return best + + def _reacquire( + self, tracker: EdgeTAMProcessor, description: str, image: Image + ) -> bool: + """Re-run the VLM for the description and re-init the tracker on it.""" + bbox = get_object_bbox_from_image( + self._get_vl_model(), image, description + ) + if bbox is None: + return False + if not valid_bbox(bbox, image.width, image.height): + logger.error( + "reacquire: VLM returned an invalid bbox %s for %r", + bbox, description, + ) + return False + bbox = clamp_bbox(bbox, image.width, image.height) + detections = tracker.init_track( + image=image, box=np.array(bbox, dtype=np.float32), obj_id=1 + ) + return len(detections) > 0 + + def _metrics(self, image: Image, detection: Detection2DBBox): + mask = getattr(detection, "mask", None) + mask_area = None if mask is None else float(np.count_nonzero(mask)) + return visual_metrics(VisualObservation( + bbox=detection.bbox, + image_width=image.width, + image_height=image.height, + confidence=detection.confidence, + mask_area_px=mask_area, + )) + + def _publish_debug( + self, + description: str, + image: Image, + detection: Detection2DBBox | None, + ) -> None: + """Publish an annotated frame for Rerun: green box when tracking, red + label while searching.""" + if detection is not None: + frame = _draw_overlay( + image, detection.bbox, description, (0, 255, 0) + ) + else: + frame = _draw_overlay( + image, None, f"searching: {description}", (0, 0, 255) + ) + self.debug_image.publish(frame) + + def _publish_ground_poses( + self, image: Image, bbox: tuple[float, float, float, float] + ) -> None: + """Publish the tracked subject's ground position for planning. + + Back-projects the subject's ground-contact pixel -- the bottom-center + of its bbox, where it meets the floor -- onto the floor plane (approach + "B") and publishes it as ``PoseStamped`` in the live ``world`` frame. + If relocalization is publishing a prebuilt ``map`` frame, also publishes + the same observation there. Tracking is never blocked on missing TF. + + Args: + image: The frame the detection came from (supplies the timestamp). + bbox: ``(x1, y1, x2, y2)`` of the tracked subject in pixels. + """ + contact_px = ground_contact_pixel(bbox) + self._publish_ground_pose( + image, + contact_px, + self.config.world_frame_id, + self.config.world_floor_z_m, + self.subject_world_pose, + self.config.tf_tolerance_s, + required=True, + ) + self._publish_ground_pose( + image, + contact_px, + self.config.map_frame_id, + self.config.map_floor_z_m, + self.subject_map_pose, + self.config.map_tf_tolerance_s, + required=False, + ) + + def _publish_ground_pose( + self, + image: Image, + contact_px: tuple[float, float], + frame_id: str, + floor_z: float, + stream: Out[PoseStamped], + tf_tolerance: float, + required: bool, + ) -> None: + """Publish one ground-raycasted subject pose in ``frame_id``. + + Args: + contact_px: ``(u, v)`` pixel where the subject meets the floor. + floor_z: Floor height in ``frame_id``; the raycast plane is exactly + ``floor_z`` because the contact pixel rests on the floor. + tf_tolerance: Max age for the ``frame_id <- camera_optical`` lookup. + The map frame's TF is republished slowly, so it needs a looser + tolerance than the live odom chain. + required: Whether this frame should always be present (live world) + vs optional (prebuilt map); only changes the warning wording. + """ + if self._intrinsics is None: + return + # Throttle retries while this frame's TF is absent: tf.get logs on every + # miss, so probing each tracked frame would flood logs at loop rate. + now = time.monotonic() + if now < self._next_pose_probe_s.get(frame_id, 0.0): + return + transform = self.tf.get( + frame_id, + self.config.camera_optical_frame_id, + time_point=image.ts, + time_tolerance=tf_tolerance, + ) + if transform is None: + self._next_pose_probe_s[frame_id] = ( + now + self.config.pose_probe_interval_s) + self._warn_missing_tf_once(frame_id, required) + return + self._next_pose_probe_s[frame_id] = 0.0 # present -> probe every frame + point = pixel_to_ground_point( + contact_px, + self._intrinsics, + transform.to_matrix(), + ground_z=floor_z, + ) + if point is None: + return + stream.publish(PoseStamped( + ts=image.ts, + frame_id=frame_id, + position=list(point), + orientation=[0.0, 0.0, 0.0, 1.0], + )) + + def _warn_missing_tf_once(self, frame_id: str, required: bool) -> None: + """Log a single warning the first time a pose frame's TF is missing.""" + if required and not self._warned_no_world_tf: + logger.warning( + "No %r<-%r transform yet; subject_world_pose paused.", + frame_id, + self.config.camera_optical_frame_id, + ) + self._warned_no_world_tf = True + elif not required and not self._warned_no_map_tf: + logger.warning( + "No %r<-%r transform yet; optional subject_map_pose paused " + "until relocalization publishes the map frame.", + frame_id, + self.config.camera_optical_frame_id, + ) + self._warned_no_map_tf = True + + def _set_snapshot(self, snapshot: TrackSnapshot) -> None: + with self._lock: + previous = self._snapshot + self._snapshot = snapshot + # Publish live tracking every frame; otherwise only on a real change, + # so a long "lost" stretch does not spam identical status messages. + if ( + snapshot.status == "tracking" + or snapshot.status != previous.status + or snapshot.message != previous.message + ): + self.subject_status.publish(snapshot.to_json()) + + def _stop_monitoring(self) -> None: + self._monitor_stop.set() + with self._lock: + thread = self._thread + self._thread = None + # Join outside the lock; the loop thread takes the lock each iteration. + if thread is not None: + thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + + +def _draw_overlay( + image: Image, + bbox: tuple[float, float, float, float] | None, + label: str, + color: tuple[int, int, int], +) -> Image: + """Draw the tracked bbox and a label on a copy of the frame (BGR).""" + frame = image.to_opencv().copy() + if bbox is not None: + x1, y1, x2, y2 = (int(round(v)) for v in bbox) + cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2) + text_y = y1 - 8 if y1 > 16 else y1 + 16 + cv2.putText( + frame, + label, + (x1, text_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.6, + color, + 2, + ) + else: + cv2.putText( + frame, + label, + (12, 28), + cv2.FONT_HERSHEY_SIMPLEX, + 0.7, + color, + 2, + ) + return Image.from_opencv(frame, ts=image.ts) + + +def _decode_base64_image(b64: str) -> Image: + bgr_array = TurboJPEG().decode(base64.b64decode(b64)) + return Image(data=bgr_array, format=ImageFormat.BGR) diff --git a/hackathon/pawtrack/src/pawtrack/system_prompt.py b/hackathon/pawtrack/src/pawtrack/system_prompt.py new file mode 100644 index 0000000000..7ec26409cf --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/system_prompt.py @@ -0,0 +1,39 @@ +"""System prompt for the greeter agent's perception stage. + +Plain string, no DimOS import. Passed to ``McpClient`` by the launcher. +""" + +from __future__ import annotations + +PAWTRACK_PROMPT = """\ +You run the perception stage of a Unitree Go2 quadruped that roams a room, finds +a person sitting on a chair, and walks over to greet them. Your job is to find +and keep track of that person; the wander/approach/greet motion is handled by a +separate layer, not by you. + +You CANNOT see images yourself. Never call observe, or any tool that returns a +picture -- it will fail. All perception happens inside track_subject, which runs +the vision model for you and reports back as text via tracking_status. Do not +"look first" -- just call track_subject with the description. + +track_subject works on ANY description, not only people -- never refuse a request +because of what the subject is. + +Tools: +- track_subject(description): find the described subject in the current camera + view (default target: "a person sitting on a chair"; also e.g. "the person in + the red shirt") and start tracking them frame to frame. +- tracking_status(): report the latest tracking state -- bounding box, how + centered the subject is, apparent size, and how recently it was seen. +- stop_tracking(): stop tracking the current subject. + +When asked to find or watch someone, call track_subject immediately with a clear +visual description (default "a person sitting on a chair"). Use tracking_status +when asked where they are. Keep replies short and concrete. +""" + +# Note: the greeter blueprint deliberately does NOT use a narrow prompt. It runs +# the stock general Go2 agent (McpClient's default SYSTEM_PROMPT) and simply adds +# the greeter skills (start_greeting / stop_greeting / greeter_status / +# wave_hello); the agent discovers and calls them from their docstrings, like any +# other skill, while remaining a general-purpose agent. diff --git a/hackathon/pawtrack/src/pawtrack/track_state.py b/hackathon/pawtrack/src/pawtrack/track_state.py new file mode 100644 index 0000000000..c6396128cc --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/track_state.py @@ -0,0 +1,433 @@ +"""Pure subject-tracking logic: visual metrics + monitor state machine. + +Two cohesive, dependency-free concerns for following a tracked subject frame to +frame: + +- Visual metrics: summarize a tracked bbox/mask into stable image-space metrics + (size and centering) for the status stream (:class:`VisualObservation`, + :class:`TrackMetrics`, :func:`visual_metrics`), plus the ground-contact pixel + for back-projection (:func:`ground_contact_pixel`). +- Monitor state machine: fold each frame's metrics (or a miss) into a status + snapshot, applying a motion-aware drift gate and deciding when to reacquire + (:class:`TrackSnapshot`, :class:`MonitorParams`, :class:`MonitorState`). + +No DimOS imports -- unit tested. The DimOS ``PawTrackSkillContainer`` +performs the effects (run the tracker/VLM, publish snapshots) and feeds this +machine each frame's result via :meth:`MonitorState.observe`; the machine +decides the status and what to do next. +""" + +from __future__ import annotations + +import dataclasses +import json +import math +import time +from typing import Literal + + +@dataclasses.dataclass(frozen=True) +class VisualObservation: + """One tracked-subject observation in image coordinates. + + Attributes: + bbox: Detection box ``(x1, y1, x2, y2)`` in pixels. + image_width: Source image width in pixels. + image_height: Source image height in pixels. + confidence: Tracker/detector confidence. + mask_area_px: Foreground mask area in pixels, when available. + """ + + bbox: tuple[float, float, float, float] + image_width: int + image_height: int + confidence: float = 1.0 + mask_area_px: float | None = None + + +@dataclasses.dataclass(frozen=True) +class TrackMetrics: + """Size and centering metrics for a tracked subject.""" + + bbox: tuple[float, float, float, float] + center_px: tuple[float, float] + width_px: float + height_px: float + area_px: float + area_ratio: float + image_error_x: float + image_error_y: float + confidence: float + mask_area_px: float | None + mask_area_ratio: float | None + + +def visual_metrics(observation: VisualObservation) -> TrackMetrics: + """Compute image-space monitoring metrics for one tracked subject. + + Args: + observation: Tracked subject bbox/mask metadata for one frame. + + Returns: + Normalized centering and size metrics suitable for status streams. + + Raises: + ValueError: If image dimensions or bbox dimensions are invalid. + """ + if observation.image_width <= 0 or observation.image_height <= 0: + raise ValueError("image dimensions must be positive") + + x1, y1, x2, y2 = observation.bbox + width = x2 - x1 + height = y2 - y1 + if width <= 0 or height <= 0: + raise ValueError("bbox dimensions must be positive") + + center_x = (x1 + x2) / 2.0 + center_y = (y1 + y2) / 2.0 + image_area = float(observation.image_width * observation.image_height) + area = width * height + mask_ratio = ( + observation.mask_area_px / image_area + if observation.mask_area_px is not None + else None + ) + return TrackMetrics( + bbox=observation.bbox, + center_px=(center_x, center_y), + width_px=width, + height_px=height, + area_px=area, + area_ratio=area / image_area, + image_error_x=(center_x - observation.image_width / 2.0) + / (observation.image_width / 2.0), + image_error_y=(center_y - observation.image_height / 2.0) + / (observation.image_height / 2.0), + confidence=observation.confidence, + mask_area_px=observation.mask_area_px, + mask_area_ratio=mask_ratio, + ) + + +Bbox = tuple[float, float, float, float] + + +def ground_contact_pixel(bbox: Bbox) -> tuple[float, float]: + """Pixel where the tracked subject meets the floor: bbox bottom-center. + + A standing or seated subject (or the chair it sits on) touches the floor at + the bottom edge of its bounding box, so the bottom-center pixel back-projects + to the subject's position on the ground. The bbox *center* would instead sit + at torso/seat height and project to a biased ground point. + + Args: + bbox: Detection box ``(x1, y1, x2, y2)`` in pixels. + + Returns: + ``(u, v)`` pixel at the horizontal center of the bbox's bottom edge. + """ + x1, _y1, x2, y2 = bbox + return ((x1 + x2) / 2.0, y2) + + +def is_bbox_shape(values: list[float] | None) -> bool: + """Whether ``values`` is a length-4 sequence of finite numbers. + + A structural guard for a caller-supplied box (e.g. an LLM tool argument) + before it is indexed as ``[x1, y1, x2, y2]``, so a malformed value returns + a clean error instead of raising. + """ + if values is None: + return False + try: + length_ok = len(values) == 4 + except TypeError: + return False + if not length_ok: + return False + return all( + isinstance(v, (int, float)) and math.isfinite(v) for v in values + ) + + +def bbox_in_image(bbox: Bbox, width: int, height: int) -> bool: + """Whether a bbox is non-degenerate and overlaps the image.""" + x1, y1, x2, y2 = bbox + if x2 <= x1 or y2 <= y1: + return False + return x2 > 0 and y2 > 0 and x1 < width and y1 < height + + +def valid_bbox(bbox: list[float] | None, width: int, height: int) -> bool: + """Whether a bbox is finite, well-formed, and overlaps the image.""" + if not is_bbox_shape(bbox): + return False + return bbox_in_image((bbox[0], bbox[1], bbox[2], bbox[3]), width, height) + + +def clamp_bbox(bbox: Bbox, width: int, height: int) -> Bbox: + """Clamp a (valid, overlapping) bbox to image bounds. + + EdgeTAM is initialized from this box, so a box that spills past the frame + edge -- common for a VLM box on a subject against an edge -- would otherwise + distort initialization. Clamping keeps the visible portion rather than + rejecting the subject. A box that passed :func:`valid_bbox` stays + non-degenerate after clamping. + """ + x1, y1, x2, y2 = bbox + fw, fh = float(width), float(height) + return ( + min(max(x1, 0.0), fw), + min(max(y1, 0.0), fh), + min(max(x2, 0.0), fw), + min(max(y2, 0.0), fh), + ) + + +Status = Literal[ + "idle", "acquiring", "tracking", "lost", "stale", "stopped", "error" +] +# What the control loop should do after a frame. +Action = Literal["track", "coast", "reacquire", "give_up"] + + +@dataclasses.dataclass(frozen=True) +class TrackSnapshot: + """Latest monitor state, suitable for JSON status output.""" + + status: Status + description: str | None + message: str + seen_at: float | None = None + bbox: tuple[float, float, float, float] | None = None + center_px: tuple[float, float] | None = None + width_px: float | None = None + height_px: float | None = None + area_px: float | None = None + area_ratio: float | None = None + image_error_x: float | None = None + image_error_y: float | None = None + confidence: float | None = None + mask_area_px: float | None = None + mask_area_ratio: float | None = None + + def to_json(self, now: float | None = None) -> str: + """Serialize with a derived age field; drop the raw monotonic stamp. + + Args: + now: Current monotonic timestamp. Defaults to ``time.monotonic()``. + + Returns: + Compact JSON for the ``subject_status`` stream. + """ + now = time.monotonic() if now is None else now + data = dataclasses.asdict(self) + data["last_seen_age_s"] = ( + None if self.seen_at is None else max(0.0, now - self.seen_at) + ) + # seen_at is a raw monotonic clock value, meaningless to consumers. + data.pop("seen_at", None) + return json.dumps(data, separators=(",", ":")) + + +@dataclasses.dataclass(frozen=True) +class MonitorParams: + """Thresholds for the monitor state machine.""" + + max_lost_frames: int = 15 # coast this long before reacquiring + reacquire_interval_frames: int = 15 # reacquire cadence while lost + max_reacquire_attempts: int = 5 # give up after this many failed retries + max_center_jump_frac: float = 2.5 # reject jumps beyond this x subject width + max_area_factor: float = 4.0 # reject area changes beyond this factor + stale_timeout_s: float = 2.0 # no fix longer than this -> "stale" + + +def _from_metrics( + status: Status, + description: str, + message: str, + metrics: TrackMetrics, + seen_at: float, +) -> TrackSnapshot: + return TrackSnapshot( + status=status, + description=description, + message=message, + seen_at=seen_at, + bbox=metrics.bbox, + center_px=metrics.center_px, + width_px=metrics.width_px, + height_px=metrics.height_px, + area_px=metrics.area_px, + area_ratio=metrics.area_ratio, + image_error_x=metrics.image_error_x, + image_error_y=metrics.image_error_y, + confidence=metrics.confidence, + mask_area_px=metrics.mask_area_px, + mask_area_ratio=metrics.mask_area_ratio, + ) + + +class MonitorState: + """Tracks monitor status across frames and decides when to reacquire. + + Feed each frame to :meth:`observe` with its tracked metrics (or None). A + detection that jumps too far or changes area too abruptly versus the last + good frame is rejected as drift (treated as a miss). A ``lost``/``stale``/ + ``error`` snapshot carries the *last good* fix forward (bbox + metrics + + timestamp), only changing the status and message. + """ + + def __init__(self, params: MonitorParams | None = None): + self._params = params if params is not None else MonitorParams() + self._lost = 0 + self._reacquire_attempts = 0 + self._skip_gate = False + self._last_tracked: TrackSnapshot | None = None + self._snapshot = TrackSnapshot( + status="idle", + description=None, + message="Nothing is being tracked.", + ) + + @property + def snapshot(self) -> TrackSnapshot: + """The current authoritative snapshot.""" + return self._snapshot + + def begin(self, description: str) -> TrackSnapshot: + """Enter ``acquiring`` for a new target; reset counters/history.""" + self._lost = 0 + self._reacquire_attempts = 0 + self._skip_gate = False + self._last_tracked = None + self._snapshot = TrackSnapshot( + status="acquiring", + description=description, + message=f"Looking for {description}.", + ) + return self._snapshot + + def observe( + self, + description: str, + metrics: TrackMetrics | None, + now: float, + ) -> tuple[TrackSnapshot, Action]: + """Fold one frame's result in and decide the next action. + + Args: + description: The tracked subject's description. + metrics: This frame's tracked metrics, or None if the tracker had + no usable detection. + now: Current monotonic timestamp. + + Returns: + ``(snapshot, action)``; action is one of ``track``/``coast``/ + ``reacquire``/``give_up``. + """ + if metrics is not None and self._is_plausible(metrics): + return self._accept(description, metrics, now), "track" + return self._miss(description, now) + + def reacquired(self) -> None: + """A reacquire (VLM or motion fallback) succeeded; reset + re-baseline. + + Clears the lost/reacquire counters and skips the drift gate on the next + frame, since the new fix may legitimately be far from the old one. + """ + self._lost = 0 + self._reacquire_attempts = 0 + self._skip_gate = True + + def errored(self, description: str, message: str) -> TrackSnapshot: + """Surface a loud error status, keeping the last known fix.""" + self._snapshot = self._carry_forward("error", description, message) + return self._snapshot + + def stopped(self) -> TrackSnapshot: + """Reset to a clean stopped state.""" + self._lost = 0 + self._reacquire_attempts = 0 + self._skip_gate = False + self._last_tracked = None + self._snapshot = TrackSnapshot( + status="stopped", + description=None, + message="Stopped tracking.", + ) + return self._snapshot + + def _is_plausible(self, metrics: TrackMetrics) -> bool: + last = self._last_tracked + if self._skip_gate or last is None or last.center_px is None: + return True + ref = max(metrics.width_px, last.width_px or 0.0, 1.0) + jump = math.dist(last.center_px, metrics.center_px) + if jump > self._params.max_center_jump_frac * ref: + return False + if last.area_px and metrics.area_px: + factor = max( + metrics.area_px / last.area_px, + last.area_px / metrics.area_px, + ) + if factor > self._params.max_area_factor: + return False + return True + + def _accept( + self, description: str, metrics: TrackMetrics, now: float + ) -> TrackSnapshot: + self._lost = 0 + self._reacquire_attempts = 0 + self._skip_gate = False + snap = _from_metrics( + "tracking", description, f"Tracking {description}.", metrics, now + ) + self._last_tracked = snap + self._snapshot = snap + return snap + + def _miss( + self, description: str, now: float + ) -> tuple[TrackSnapshot, Action]: + self._lost += 1 + action: Action = "coast" + message = "Lost sight of the target." + due = ( + self._lost >= self._params.max_lost_frames + and (self._lost - self._params.max_lost_frames) + % self._params.reacquire_interval_frames == 0 + ) + if due: + self._reacquire_attempts += 1 + if self._reacquire_attempts > self._params.max_reacquire_attempts: + action = "give_up" + message = "Gave up reacquiring the target." + else: + action = "reacquire" + status: Status = "lost" + last = self._last_tracked + if ( + last is not None + and last.seen_at is not None + and now - last.seen_at > self._params.stale_timeout_s + ): + status = "stale" + if action == "coast": + message = "No fix for a while; the target is stale." + snap = self._carry_forward(status, description, message) + self._snapshot = snap + return snap, action + + def _carry_forward( + self, status: Status, description: str, message: str + ) -> TrackSnapshot: + if self._last_tracked is None: + return TrackSnapshot(status, description, message) + return dataclasses.replace( + self._last_tracked, + status=status, + description=description, + message=message, + ) diff --git a/hackathon/pawtrack/src/pawtrack/visited_registry.py b/hackathon/pawtrack/src/pawtrack/visited_registry.py new file mode 100644 index 0000000000..3d4f60a52c --- /dev/null +++ b/hackathon/pawtrack/src/pawtrack/visited_registry.py @@ -0,0 +1,120 @@ +"""Pure visited-subject memory + random target selection (no DimOS). + +The greeter must not greet the same person twice in quick succession and, when +several people are seated in view, should pick one at random. This module is the +dependency-free core of that: it remembers the floor position (world/map frame) +and time of each greeted subject, treats a freshly detected candidate within +``revisit_radius_m`` of any *recently* greeted position as already greeted, and +chooses one of the remaining candidates at random. + +A greeting is forgotten after ``forget_after_s`` seconds, so the same person +becomes greet-eligible again -- the demo stays lively instead of going quiet once +everyone in the room has been greeted once. ``forget_after_s=None`` keeps the +old "greet each subject only once per run" behaviour. + +Identity is positional, not visual: two detections close together on the floor +are the same subject. That is robust for seated people (their chair stays put) +and needs only the ground position the tracker already publishes. + +Pure: stdlib only, unit tested. The caller injects a ``random.Random`` so runs +are reproducible, and the current timestamp so the forget logic is testable +without a clock. The DimOS container supplies candidate floor positions (from the +ground raycast) and marks a subject visited once it has been greeted. +""" + +from __future__ import annotations + +import dataclasses +import math +import random + +Position = tuple[float, float] + + +@dataclasses.dataclass(frozen=True) +class Candidate: + """One detected person-on-a-chair to consider greeting. + + Attributes: + position: ``(x, y)`` floor position in the world/map frame. + bbox: Detection box ``(x1, y1, x2, y2)`` in pixels, for the tracker. + """ + + position: Position + bbox: tuple[float, float, float, float] + + +class VisitedRegistry: + """Remembers recently greeted floor positions and filters candidates. + + A candidate within ``revisit_radius_m`` of a position greeted less than + ``forget_after_s`` ago counts as already greeted. ``revisit_radius_m`` should + comfortably exceed the tracker's position noise and a chair's footprint so + one subject is not greeted twice in a row; ``forget_after_s`` sets how long + before the same subject may be greeted again (``None`` = never). + """ + + def __init__( + self, + revisit_radius_m: float = 1.0, + forget_after_s: float | None = None, + ): + if revisit_radius_m <= 0.0: + raise ValueError("revisit_radius_m must be positive") + if forget_after_s is not None and forget_after_s <= 0.0: + raise ValueError("forget_after_s must be positive or None") + self._radius = revisit_radius_m + self._forget_after = forget_after_s + self._visited: list[tuple[Position, float]] = [] + + @property + def visited_count(self) -> int: + """How many greetings have been recorded (counts repeats).""" + return len(self._visited) + + def _expired(self, when: float, now: float) -> bool: + return self._forget_after is not None and now - when > self._forget_after + + def is_visited(self, position: Position, now: float = 0.0) -> bool: + """Whether ``position`` was greeted within the radius and not forgotten.""" + return any( + math.dist(position, seen) <= self._radius + and not self._expired(when, now) + for seen, when in self._visited + ) + + def mark_visited(self, position: Position, now: float = 0.0) -> None: + """Record ``position`` as greeted at ``now``.""" + self._visited.append( + ((float(position[0]), float(position[1])), float(now))) + + def unvisited( + self, candidates: list[Candidate], now: float = 0.0 + ) -> list[Candidate]: + """The candidates not greeted recently (within ``forget_after_s``).""" + return [c for c in candidates if not self.is_visited(c.position, now)] + + +def select_target( + candidates: list[Candidate], + registry: VisitedRegistry, + rng: random.Random, + now: float = 0.0, +) -> Candidate | None: + """Pick a random not-recently-greeted candidate, or None if there are none. + + Args: + candidates: Detected persons-on-chairs this tick (with floor positions). + registry: The visited memory used to filter out recently greeted ones. + rng: Source of randomness for the choice. + now: Current timestamp, for the registry's forget logic. + + Returns: + A randomly chosen candidate that has not been greeted within + ``forget_after_s``, or None when every candidate was greeted recently or + the list is empty (the greeter then keeps wandering). + """ + options = registry.unvisited(candidates, now) + if not options: + return None + return rng.choice(options) diff --git a/hackathon/pawtrack/tests/test_approach_geometry.py b/hackathon/pawtrack/tests/test_approach_geometry.py new file mode 100644 index 0000000000..33459002bc --- /dev/null +++ b/hackathon/pawtrack/tests/test_approach_geometry.py @@ -0,0 +1,72 @@ +"""Tests for the pure approach geometry helpers.""" + +import math + +import pytest + +from pawtrack.approach_geometry import ( + approach_velocity, + centering_yaw, + drive_to_position, + hold_distance_vx, +) + + +def test_centering_yaw_opposes_error_and_clamps(): + # Subject right of center (positive error) -> turn the other way (negative). + assert centering_yaw(0.5, 1.0, 0.8) == -0.5 + assert centering_yaw(-0.5, 1.0, 0.8) == 0.5 + assert centering_yaw(2.0, 1.0, 0.8) == -0.8 # clamped to the yaw limit + + +def test_hold_distance_vx_holds_with_deadband_and_safe_reverse(): + # Too far -> drive forward (clamped to the speed limit). + assert hold_distance_vx( + 3.0, 1.5, forward_gain=0.8, max_forward_mps=0.5) == 0.5 + # Inside the deadband around the standoff -> hold (no hunting in/out). + assert hold_distance_vx(1.55, 1.5, deadband_m=0.15) == 0.0 + assert hold_distance_vx(1.45, 1.5, deadband_m=0.15) == 0.0 + # Too close (beyond the deadband) -> back off to a safe gap, clamped. + assert hold_distance_vx( + 1.0, 1.5, deadband_m=0.15, forward_gain=0.8, max_reverse_mps=0.3) == -0.3 + + +def test_approach_velocity_holds_standoff_and_centers(): + vx, wz = approach_velocity( + 3.0, 1.5, 0.0, forward_gain=0.8, max_forward_mps=0.5) + assert vx == 0.5 and wz == 0.0 # far -> forward, centered + + vx, _ = approach_velocity(1.5, 1.5, 0.0) # at standoff + assert vx == 0.0 # hold + + vx, wz = approach_velocity( + 1.7, 1.5, 0.3, deadband_m=0.15, forward_gain=0.8, turn_gain=1.0, + max_yaw_radps=0.8) + assert vx == pytest.approx(0.8 * 0.2) # 0.2 > deadband -> ease forward + assert wz == pytest.approx(-0.3) # turn to recenter + + +def test_drive_to_position_faces_then_advances_to_standoff(): + # Target straight ahead (+x), robot facing +x: drive forward, no yaw. + vx, wz = drive_to_position( + (0.0, 0.0), 0.0, (3.0, 0.0), 1.0, forward_gain=0.8, max_forward_mps=0.5) + assert vx == 0.5 and wz == pytest.approx(0.0) # 0.8*(3-1)=1.6 -> clamp 0.5 + + # Target to the left, robot facing +x: turn toward it, do not lunge sideways. + vx, wz = drive_to_position( + (0.0, 0.0), 0.0, (0.0, 3.0), 1.0, turn_gain=1.0, max_yaw_radps=0.8, + heading_tol_rad=0.5) + assert vx == 0.0 # heading error ~pi/2 > tol -> turn in place first + assert wz == pytest.approx(0.8) # positive yaw toward +y, clamped + + # Inside the standoff already: hold (no forward), roughly facing. + vx, wz = drive_to_position((0.0, 0.0), 0.0, (0.5, 0.0), 1.0) + assert vx == 0.0 # range < standoff + + +def test_drive_to_position_wraps_heading_error(): + # Target behind the robot: should turn (yaw != 0), not drive forward. + vx, wz = drive_to_position( + (0.0, 0.0), 0.0, (-3.0, 0.001), 1.0, heading_tol_rad=0.5) + assert vx == 0.0 + assert abs(wz) > 0.0 and abs(math.pi - abs(wz / 1.0)) >= 0 # turning around diff --git a/hackathon/pawtrack/tests/test_greeter_agent_e2e.py b/hackathon/pawtrack/tests/test_greeter_agent_e2e.py new file mode 100644 index 0000000000..17bc0a090a --- /dev/null +++ b/hackathon/pawtrack/tests/test_greeter_agent_e2e.py @@ -0,0 +1,142 @@ +"""End-to-end agent test: does the LLM pick our greeting tools from a prompt? + +This drives the *real* agent stack (MCP server + LLM client + the greeter skill +container) exactly like a live run: it injects a natural-language prompt and +checks the agent's tool calls. So it needs a real LLM + API credentials and an +LCM/MCP bus, and is therefore skipped unless ``PAWTRACK_LLM_E2E=1`` is set. + + PAWTRACK_LLM_E2E=1 OPENAI_API_KEY=... PAWTRACK_MODEL=openai:deepseek-chat \ + OPENAI_BASE_URL=https://api.deepseek.com \ + PYTHONPATH=src pytest tests/test_greeter_agent_e2e.py -s + +It mirrors DimOS's own ``agent_setup`` harness (dimos/agents/conftest.py): build +``McpServer`` + ``McpClient`` + the skills + an ``AgentTestRunner`` that injects +messages, collect the agent's message history off ``/agent``, and assert on the +tool calls. The greeter loop's scan is disabled (``scan_interval_s`` huge) so +executing ``start_greeting`` does not load the vision models during the test. +""" + +from __future__ import annotations + +import os +from threading import Event +from typing import Any + +import pytest + +pytestmark = pytest.mark.skipif( + not os.environ.get("PAWTRACK_LLM_E2E"), + reason="needs a real LLM + creds; set PAWTRACK_LLM_E2E=1 to run.", +) + +from langchain_core.messages import HumanMessage # noqa: E402 + +from dimos.agents.agent_test_runner import AgentTestRunner # noqa: E402 +from dimos.agents.mcp.mcp_client import McpClient # noqa: E402 +from dimos.agents.mcp.mcp_server import McpServer # noqa: E402 +from dimos.core.coordination.blueprints import autoconnect # noqa: E402 +from dimos.core.coordination.module_coordinator import ( # noqa: E402 + ModuleCoordinator, +) +from dimos.core.core import rpc # noqa: E402 +from dimos.core.global_config import global_config # noqa: E402 +from dimos.core.module import Module # noqa: E402 +from dimos.core.transport import pLCMTransport # noqa: E402 + +from pawtrack.greeter_container import GreeterSkillContainer # noqa: E402 + +_MCP_URL = os.environ.get("MCP_URL", "http://localhost:9990/mcp") +_LCM_URL = os.environ.get("LCM_DEFAULT_URL", "udpm://239.255.76.67:7667?ttl=0") + + +class _FakePatrol(Module): + """Minimal stand-in for the PatrollingModule so the greeter's spec injects. + + The greeter container requires a patrolling module; on the robot that is the + nav stack. For this agent test we only care about tool selection, so a no-op + patrol that satisfies the spec is enough. + """ + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._patrolling = False + + @rpc + def start_patrol(self) -> str: + self._patrolling = True + return "Patrol started." + + @rpc + def is_patrolling(self) -> bool: + return self._patrolling + + @rpc + def stop_patrol(self) -> str: + self._patrolling = False + return "Patrol stopped." + + +def _run_agent(messages: list[HumanMessage]) -> list[Any]: + """Run the greeter agent over messages; return the agent's message history.""" + history: list[Any] = [] + finished = Event() + agent_t: pLCMTransport = pLCMTransport("/agent", url=_LCM_URL) + finished_t: pLCMTransport = pLCMTransport("/finished", url=_LCM_URL) + unsubs = [ + agent_t.subscribe(history.append), + finished_t.subscribe(lambda _msg: finished.set()), + ] + + # Omit system_prompt so the client uses its default general agent prompt -- + # exactly like the greeter-agentic blueprint, where greeting is just one tool. + model = os.environ.get("PAWTRACK_MODEL") + client_kwargs: dict[str, Any] = {"mcp_server_url": _MCP_URL} + if model: + client_kwargs["model"] = model + + blueprint = autoconnect( + _FakePatrol.blueprint(), + # Disable scanning so executing start_greeting does not load the VLM. + GreeterSkillContainer.blueprint(scan_interval_s=1e9), + McpServer.blueprint(), + McpClient.blueprint(**client_kwargs), + AgentTestRunner.blueprint(messages=messages), + ) + global_config.update(viewer="none") + coordinator = ModuleCoordinator.build(blueprint) + try: + if not finished.wait(120): + raise TimeoutError("agent did not finish processing the prompt") + finally: + coordinator.stop() + agent_t.stop() + finished_t.stop() + for unsub in unsubs: + unsub() + return history + + +def _tool_calls(history: list[Any], name: str) -> list[Any]: + return [ + call + for message in history + for call in (getattr(message, "tool_calls", None) or []) + if call["name"] == name + ] + + +def test_agent_selects_greeting_then_stop_from_prompts(): + history = _run_agent([ + HumanMessage( + "Walk around the room and say hi to everyone who is sitting down. " + "Don't ask me for confirmation, just start doing it." + ), + HumanMessage("Okay, that is enough for now -- stop greeting."), + ]) + + assert _tool_calls(history, "start_greeting"), ( + "the agent did not select start_greeting for a greeting request" + ) + assert _tool_calls(history, "stop_greeting"), ( + "the agent did not select stop_greeting when asked to stop" + ) diff --git a/hackathon/pawtrack/tests/test_greeter_container.py b/hackathon/pawtrack/tests/test_greeter_container.py new file mode 100644 index 0000000000..4678957bbe --- /dev/null +++ b/hackathon/pawtrack/tests/test_greeter_container.py @@ -0,0 +1,846 @@ +"""Orchestration tests for the greeter container. + +Drive ``_tick`` synchronously with fakes for the models, the patrol spec, the +GO2 connection, and TF -- no GPU, robot, or threads -- and assert the full +wander -> approach -> greet -> cooldown -> wander cycle, the patrol stop/resume, +the Hello/BalanceStand greeting, and the don't-revisit-too-soon rule. +""" + +import contextlib +import json +import math + +import numpy as np +import pytest + +from pawtrack import greeter_container +from pawtrack.greeter_container import GreeterSkillContainer +from pawtrack.greeter_state import ( + GreeterMachine, + GreeterObservation, + GreeterParams, +) +from pawtrack.ground_raycast import CameraIntrinsics +from pawtrack.visited_registry import Candidate + +# Bottom-center (470, 240); with the look-down transform + intrinsics below this +# back-projects to ~1.5 m away -- a safe standoff (>= min_safe, <= standoff+tol). +_SUBJECT_BBOX = (450.0, 200.0, 490.0, 240.0) + + +class _Recorder: + def __init__(self): + self.msgs = [] + + def publish(self, msg): + self.msgs.append(msg) + + +class _FakeImage: + width = 640 + height = 480 + ts = 0.0 + + def to_opencv(self): + return np.zeros((self.height, self.width, 3), dtype=np.uint8) + + +class _FakePatrol: + def __init__(self, patrolling=False): + self.patrolling = patrolling + self.calls = [] + + def start_patrol(self): + self.patrolling = True + self.calls.append("start") + return "started" + + def stop_patrol(self): + self.patrolling = False + self.calls.append("stop") + return "stopped" + + def is_patrolling(self): + return self.patrolling + + +class _FakeConnection: + def __init__(self): + self.requests = [] + + def publish_request(self, topic, data): + self.requests.append((topic, data)) + return {} + + +class _Det: + def __init__(self, bbox): + self.bbox = bbox + + def bbox_2d_volume(self): + x1, y1, x2, y2 = self.bbox + return (x2 - x1) * (y2 - y1) + + +class _Dets: + def __init__(self, detections): + self.detections = detections + + def __len__(self): + return len(self.detections) + + +class _FakeTracker: + def __init__(self, track_bbox=None): + self._track_bbox = track_bbox + self.inited = [] + + def init_track(self, image, box, obj_id=1): + self.inited.append(tuple(box)) + return [1] + + def process_image(self, _image): + if self._track_bbox is None: + return _Dets([]) + return _Dets([_Det(self._track_bbox)]) + + def stop(self): + pass + + +class _FakeVl: + def __init__(self, response, facing="back"): + self.response = response + self.facing = facing # answer to the front/back facing prompt + self.last_prompt = None + + def query(self, _image, prompt): + self.last_prompt = prompt + if "facing" in prompt.lower(): # the post-wave facing-detection query + return self.facing + return self.response + + def stop(self): + pass + + +class _FakeTransform: + def to_matrix(self): + # Camera 1 m up looking straight down (origin at world x=y=0). + return np.array([ + [1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, -1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ]) + + +class _FakeTf: + def __init__(self, present=True): + self._present = present + + def get(self, _frame, _child, **_kwargs): + return _FakeTransform() if self._present else None + + +def _container(vl_response="[]", track_bbox=None, facing="back"): + c = GreeterSkillContainer() + c.cmd_vel = _Recorder() + c.debug_image = _Recorder() + c.greeter_phase = _Recorder() + c.subject_world_pose = _Recorder() + c._patrolling_module_spec = _FakePatrol() + c._connection = _FakeConnection() + c._intrinsics = CameraIntrinsics(fx=100.0, fy=100.0, cx=320.0, cy=240.0) + c._tf = _FakeTf() + c._latest_image = _FakeImage() + c._vl_model = _FakeVl(vl_response, facing) + c._tracker = _FakeTracker(track_bbox) + c.config.recover_settle_s = 0.0 # no real RecoveryStand->BalanceStand wait + # Short timers so the timed phases advance within the test. + c._machine = GreeterMachine(GreeterParams( + standoff_m=1.5, standoff_tolerance_m=0.3, + greet_duration_s=0.5, cooldown_duration_s=0.5, + )) + return c + + +class _FakeVec: + def __init__(self, x, y): + self.x = x + self.y = y + + +class _FakeOdom: + """Minimal PoseStamped stand-in: position (x, y) + yaw (rad).""" + + def __init__(self, x, y, yaw=0.0): + self.position = _FakeVec(x, y) + self.yaw = yaw + + +class _FakeOrigin: + def __init__(self, x, y): + self.position = _FakeVec(x, y) + + +class _FakeCostmap: + """Minimal OccupancyGrid stand-in: grid[y][x] costs, resolution, origin.""" + + def __init__(self, grid, resolution, origin_xy): + self.grid = grid + self.resolution = resolution + self.origin = _FakeOrigin(*origin_xy) + + +def _sport_ids(connection): + return [data["api_id"] for _topic, data in connection.requests] + + +def test_wander_starts_patrol_and_stays_without_a_target(): + c = _container(vl_response="[]") # nothing detected + c._tick(2.0) # past the first scan interval + assert c._patrolling_module_spec.is_patrolling() + assert c._machine.state == "wander" + assert c.cmd_vel.msgs == [] # wander leaves driving to the patrol + + +def test_wander_rescans_when_all_visible_people_are_already_greeted(): + # The trap: the dog sees a person but already greeted them, so it won't + # re-engage -- and the patrol is goal-less and blocked by that person. It + # must rotate to look elsewhere instead of freezing. + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + seen = c._scan(c._latest_image) # where does the detected person land? + assert seen # a person is visible + c._registry.mark_visited(seen[0].position) # ... and already greeted + c._patrolling_module_spec.patrolling = True # patrol was (fruitlessly) roaming + + c.cmd_vel.msgs.clear() + c._tick(2.0) # wander: sees only the visited person + + assert c._machine.state == "wander" + assert c._rescanning + last = c.cmd_vel.msgs[-1] + assert last.angular.z != 0.0 and last.linear.x == 0.0 # rotating in place + assert "stop" in c._patrolling_module_spec.calls # took over from the patrol + assert not c._patrolling_module_spec.is_patrolling() + + +def test_wander_does_not_rescan_when_no_one_is_visible(): + c = _container(vl_response="[]") # empty room in view + c.cmd_vel.msgs.clear() + c._tick(2.0) + assert c._machine.state == "wander" + assert c.cmd_vel.msgs == [] # no rescan; leave roaming to the patrol + assert c._patrolling_module_spec.is_patrolling() + assert not c._rescanning + + +def test_wander_rescan_is_bounded_and_hands_back_to_patrol(): + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + seen = c._scan(c._latest_image) + c._registry.mark_visited(seen[0].position) + c._rescanning = True + c._rescan_started_s = 0.0 # already rotating since t=0 + + c._tick(c.config.wander_rescan_max_s + 1.0) # past the rescan budget + + assert not c._rescanning # gave the rotation a fair shot + assert c._patrolling_module_spec.is_patrolling() # handed back to the patrol + + +def test_full_cycle_and_do_not_revisit(): + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', + track_bbox=_SUBJECT_BBOX, + ) + + c._tick(2.0) # wander: detect + select -> approach + assert c._machine.state == "approach" + assert "stop" in c._patrolling_module_spec.calls # patrol paused to engage + assert c._tracker.inited # tracker locked onto the chosen subject + + c._tick(2.1) # approach: at a safe standoff (~1.5 m) -> greet, any facing + assert c._machine.state == "greet" + assert _sport_ids(c._connection)[-1] == 1016 # Hello fired once + assert c.cmd_vel.msgs[-1].is_zero() # halted before the wave, not still moving + + c._tick(2.8) # greet timer elapsed -> cooldown + assert c._machine.state == "cooldown" + assert 1006 in _sport_ids(c._connection) # RecoveryStand after the gesture + assert c._registry.is_visited((1.5, 0.0), now=2.8) # subject recorded + + c._tick(3.4) # cooldown timer elapsed -> wander + assert c._machine.state == "wander" + assert 1002 in _sport_ids(c._connection) # BalanceStand before patrol resumes + # The full recovery sequence ran in order: Hello -> RecoveryStand -> Balance. + assert _sport_ids(c._connection) == [1016, 1006, 1002] + + # The just-greeted person is still within the forget window: detected again + # but not re-selected. + stop_count_before = c._patrolling_module_spec.calls.count("stop") + c._tick(5.0) + assert c._machine.state == "wander" + assert c._patrolling_module_spec.calls.count("stop") == stop_count_before + + +def test_subject_is_re_greeted_after_the_forget_window(): + # The relaxed same-person rule: once revisit_forget_s passes, the same seated + # person becomes greet-eligible again instead of being skipped for the run. + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', track_bbox=_SUBJECT_BBOX) + seen = c._scan(c._latest_image) + c._registry.mark_visited(seen[0].position, now=0.0) # greeted at t=0 + + c._tick(30.0) # within revisit_forget_s (60 s default) -> still skipped + assert c._machine.state == "wander" + + c._tick(70.0) # past the forget window -> re-engage the same person + assert c._machine.state == "approach" + + +def test_greets_with_a_heart_when_the_subject_faces_the_camera(): + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', + track_bbox=_SUBJECT_BBOX, facing="front") + c.config.heart_duration_s = 0.0 # no real wait in the test + c._tick(2.0) # wander -> approach + c._tick(2.1) # approach -> greet (Hello) + c._tick(2.8) # greet -> cooldown: facing=front -> FingerHeart + + ids = _sport_ids(c._connection) + heart = greeter_container._FINGER_HEART_API_ID + assert heart in ids # rewarded the facing subject + # ... after the wave and before recovery: Hello -> FingerHeart -> RecoveryStand + assert ids.index(1016) < ids.index(heart) < ids.index(1006) + + +def test_no_heart_when_the_subject_faces_away(): + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', + track_bbox=_SUBJECT_BBOX, facing="back") + c.config.heart_duration_s = 0.0 + c._tick(2.0) + c._tick(2.1) + c._tick(2.8) # facing away -> no heart, just move on + assert greeter_container._FINGER_HEART_API_ID not in _sport_ids(c._connection) + + +def test_facing_failure_still_recovers_and_marks_visited(): + # If the post-wave VLM facing call blows up, the dog must still recover and + # mark the subject visited -- the cooldown effects run once on the edge, so a + # raised exception would otherwise freeze it after Hello and let it re-greet + # the same subject. + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', track_bbox=_SUBJECT_BBOX) + c.config.heart_duration_s = 0.0 + + def query(_image, prompt): + if "facing" in prompt.lower(): + raise RuntimeError("VLM timeout") + return '[{"bbox": [450, 200, 490, 240]}]' + + c._vl_model.query = query + + c._tick(2.0) # wander -> approach + c._tick(2.1) # approach -> greet + c._tick(2.8) # greet -> cooldown: facing query raises, must not abort cleanup + + assert c._machine.state == "cooldown" + assert 1006 in _sport_ids(c._connection) # RecoveryStand still sent + assert c._registry.is_visited((1.5, 0.0), now=2.8) # subject still marked + + +def test_approach_drives_by_ground_distance_not_a_width_model(): + c = _container() + far = c._approach_twist(_SUBJECT_BBOX, 640, distance_m=3.0) + assert far.linear.x > 0.0 # far -> close the distance + too_close = c._approach_twist(_SUBJECT_BBOX, 640, distance_m=0.8) + assert too_close.linear.x < 0.0 # too close -> back off to a safe gap + blind = c._approach_twist(_SUBJECT_BBOX, 640, distance_m=None) + assert blind.linear.x == 0.0 # no ground fix -> face only, do not lunge + + +def test_lost_subject_deadreckons_to_last_known_not_bail(): + # On track loss, the dog drives (odom) toward the subject's last-known floor + # position instead of bailing -- it stays engaged while still en route. + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', track_bbox=_SUBJECT_BBOX) + c._tick(2.0) # -> approach; last_position = (1.5, 0) + assert c._machine.state == "approach" + + c._tracker = _FakeTracker(track_bbox=None) # subject vanishes + c._latest_odom = _FakeOdom(5.0, 0.0, yaw=math.pi) # far, facing the target + c.cmd_vel.msgs.clear() + c._tick(2.5) + + assert c._machine.state == "approach" # did not abandon on loss + assert not c.cmd_vel.msgs[-1].is_zero() # driving toward last-known, not frozen + + +def test_lost_subject_gives_up_after_arriving_and_not_re_detecting(): + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', track_bbox=_SUBJECT_BBOX) + c._tick(2.0) # -> approach; last_position = (1.5, 0) + + c._tracker = _FakeTracker(track_bbox=None) # vanishes + c._latest_odom = _FakeOdom(3.0, 0.0) # within standoff of (1.5, 0): arrived + c._vl_model.response = "[]" # ... and it is no longer detectable there + c._tick(4.0) # past scan interval -> re-detect runs, finds nothing -> give up + + assert c._machine.state == "wander" + assert c._last_snapshot.reason == "lost" + + +def test_lost_subject_re_acquires_at_last_known_and_greets(): + c = _container( + vl_response='[{"bbox": [450, 200, 490, 240]}]', track_bbox=_SUBJECT_BBOX) + c._tick(2.0) # -> approach; last_position = (1.5, 0) + + c._tracker = _FakeTracker(track_bbox=None) # vanishes mid-approach + c._latest_odom = _FakeOdom(3.0, 0.0) # arrived at the last-known spot + inited_before = len(c._tracker.inited) + c._tick(4.0) # re-detect finds the chair there -> re-acquire + wave + + assert len(c._tracker.inited) > inited_before # tracker re-seeded + assert c._machine.state == "greet" # re-acquired at standoff -> greet + assert _sport_ids(c._connection)[-1] == 1016 # Hello fired + + +def test_stuck_engagement_returns_to_patrol_and_skips_subject(): + # A subject we can never reach (always beyond standoff) must not hang the + # loop: the engage timeout bails to wander and skips that subject. + far_bbox = (500.0, 200.0, 560.0, 240.0) # bottom-center -> ~2.1 m away + c = _container( + vl_response='[{"bbox": [500, 200, 560, 240]}]', track_bbox=far_bbox) + c._machine = GreeterMachine(GreeterParams( + standoff_m=1.5, standoff_tolerance_m=0.3, + engage_timeout_s=1.0, greet_duration_s=0.5, cooldown_duration_s=0.5, + )) + + c._tick(2.0) # wander -> approach (far subject selected) + assert c._machine.state == "approach" + c._tick(2.5) # too far to reach standoff, still within the timeout + assert c._machine.state == "approach" + c._tick(3.1) # engagement stalled past engage_timeout_s -> wander + assert c._machine.state == "wander" + assert c._registry.is_visited((2.1, 0.0)) # the unreachable subject is skipped + assert c.cmd_vel.msgs[-1].is_zero() # no stale drive command into wander + + +def test_start_and_stop_greeting_skill(): + c = _container(vl_response="[]") + started = c.start_greeting() + assert "start" in started.lower() + stopped = c.stop_greeting() + assert "stop" in stopped.lower() + assert c.cmd_vel.msgs and c.cmd_vel.msgs[-1].is_zero() + assert "stopped" in c.greeter_phase.msgs[-1].lower() # stop phase published + + +def test_start_greeting_accepts_a_target_override(): + # The target is parameterizable so the same loop can greet "a chair" in sim + # (no seated person there) and "a person sitting on a chair" on the robot. + c = _container(vl_response="[]") + c.start_greeting(target="a chair") + assert c._description == "a chair" # overrides the configured default + c.stop_greeting() + c.start_greeting() # omitted -> back to the configured subject + assert c._description == c.config.description + c.stop_greeting() + + +def test_scan_queries_the_active_target(): + c = _container(vl_response="[]") + c._description = "a chair" + c._scan(_FakeImage()) + assert "a chair" in c._vl_model.last_prompt # VLM asked for the active target + + +def test_module_stop_halts_the_robot(): + c = _container() + c._patrolling_module_spec.patrolling = True + # stop() halts before tearing down; the full DimOS teardown that follows + # needs more wiring than the fakes provide, so suppress just that part. + with contextlib.suppress(Exception): + c.stop() # DimOS shutdown path, not the stop_greeting skill + assert c.cmd_vel.msgs[-1].is_zero() + assert not c._patrolling_module_spec.is_patrolling() + + +def test_halt_stops_patrol_even_if_velocity_publish_fails(): + c = _container() + c._patrolling_module_spec.patrolling = True + + class _Raising: + def publish(self, _msg): + raise RuntimeError("publish boom") + + c.cmd_vel = _Raising() + c._halt() # must not raise, and must still stop the patrol + assert not c._patrolling_module_spec.is_patrolling() + + +def test_reset_engagement_clears_transient_state_but_keeps_registry(): + c = _container() + c._machine.step( # drive into a mid-engagement phase + GreeterObservation(target_acquired=True, subject_visible=True), 0.0) + c._chosen = Candidate(position=(1.0, 0.0), bbox=(0.0, 0.0, 10.0, 10.0)) + c._last_position = (1.0, 0.0) + c._registry.mark_visited((9.0, 9.0)) # greeted in an earlier run + + c._reset_engagement() + + assert c._machine.state == "wander" + assert c._chosen is None and c._last_position is None + assert c._last_snapshot is None + assert c._registry.is_visited((9.0, 9.0)) # kept across restarts + + +def test_status_trace_and_world_pose_explain_the_engagement(): + far_bbox = (500.0, 200.0, 560.0, 240.0) # ~2.1 m -> stays approaching + c = _container( + vl_response='[{"bbox": [500, 200, 560, 240]}]', track_bbox=far_bbox) + c._tick(2.0) # wander -> approach (select a subject ~2.1 m away) + c._tick(2.1) # approach: track + locate (still too far to greet) + + trace = json.loads(c.greeter_status()) + assert trace["state"] == "approach" + assert trace["subject_visible"] is True + assert trace["distance_m"] is not None + for key in ("patrolling", "subject_xy", "greeted", "reason"): + assert key in trace + assert "is_front" not in trace # facing is gone from the trace + assert c.subject_world_pose.msgs # 3D marker published while engaging + + +def test_wave_hello_halts_then_waves_then_recovers(): + c = _container() + c.config.greet_duration_s = 0.0 # no waits in the test + c.config.recover_settle_s = 0.0 + c._patrolling_module_spec.patrolling = True # dog was wandering when asked + + result = c.wave_hello() + + assert "wave" in result.lower() + assert not c._patrolling_module_spec.is_patrolling() # patrol stopped to wave + assert c.cmd_vel.msgs[-1].is_zero() # stood still to wave + # Hello, then recover: RecoveryStand -> BalanceStand so it can move again. + assert _sport_ids(c._connection) == [1016, 1006, 1002] + + +def test_wave_hello_stops_a_running_loop_before_waving(): + # If the autonomous loop is running, wave_hello must stop it (and join the + # tick thread) first, so no tick restarts the patrol or drives the dog mid-wave. + c = _container(vl_response="[]") + c.config.greet_duration_s = 0.0 + c.config.recover_settle_s = 0.0 + c.start_greeting() + assert c._thread is not None and c._thread.is_alive() + + c.wave_hello() + + assert c._thread is None # the loop was stopped before the gesture + assert _sport_ids(c._connection) == [1016, 1006, 1002] + + +def test_scan_clamps_off_frame_boxes_before_locating(): + # A VLM box that spills off the right edge is valid (it overlaps the frame) + # but must be clamped before raycasting, or its off-image bottom-center + # locates to the wrong floor point. + c = _container(vl_response='[{"bbox": [600, 200, 700, 240]}]') + + candidates = c._scan(_FakeImage()) + + assert len(candidates) == 1 + assert candidates[0].bbox[2] == 640.0 # clamped to the frame width + # Clamped bottom-center (620, 240) -> floor x = (620-320)/100 = 3.0. + assert candidates[0].position == pytest.approx((3.0, 0.0)) + + +def test_scan_drops_a_candidate_on_a_mapped_wall(): + # The reflection case: the VLM box raycasts onto the floor at (1.5, 0.0), + # but the costmap marks that spot a wall (a person seen reflected in glass). + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + assert len(c._scan(_FakeImage())) == 1 # no costmap yet -> kept + + res = 0.1 + wall = [[0] * 11 for _ in range(11)] + wall[5][5] = 100 # lethal cell at the candidate's floor position + c._latest_costmap = _FakeCostmap(wall, res, (1.5 - 5 * res, 0.0 - 5 * res)) + assert c._scan(_FakeImage()) == [] # dropped: on a mapped wall + + free = [[0] * 11 for _ in range(11)] + c._latest_costmap = _FakeCostmap(free, res, (1.5 - 5 * res, 0.0 - 5 * res)) + assert len(c._scan(_FakeImage())) == 1 # free space at the same spot -> kept + + +def test_obstacle_filter_can_be_disabled(): + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + wall = [[100] * 11 for _ in range(11)] + c._latest_costmap = _FakeCostmap(wall, 0.1, (1.0, -0.5)) + c.config.reject_on_mapped_obstacle = False + assert len(c._scan(_FakeImage())) == 1 # filter off -> kept despite the wall + + +def test_scan_drops_a_candidate_beyond_the_distance_bound(): + # A box near the image horizon ranges to a far floor point; such a phantom + # is dropped before the dog commits to it and walks at empty space. + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + c._locate = lambda box, image: ((50.0, 0.0), 50.0) # 50 m ranging artifact + assert c._scan(_FakeImage()) == [] # 50 m > the 15 m bound -> dropped + c.config.max_subject_distance_m = 100.0 + assert len(c._scan(_FakeImage())) == 1 # within a looser bound -> kept + + +def test_roams_forward_when_odom_stalls_in_wander(): + # Patrol is goal-less and the dog is not moving: detect the stall from odom + # and take over with a forward roam into new space. + c = _container(vl_response="[]") + c._latest_odom = _FakeOdom(0.0, 0.0) + c.config.stall_timeout_s = 1.0 + c.config.roam_turn_s = 0.0 # skip the reorient phase -> straight to forward + c._patrolling_module_spec.patrolling = True + c._wander_tick(c._latest_image, 0.0) # establish the anchor (not yet stalled) + assert not c._roaming + + c.cmd_vel.msgs.clear() + c._wander_tick(c._latest_image, 2.0) # no movement for 2 s > 1 s -> roam + + assert c._roaming + last = c.cmd_vel.msgs[-1] + assert last.linear.x > 0.0 # driving forward into new space + assert not c._patrolling_module_spec.is_patrolling() # took over the patrol + + +def test_roam_reorients_in_place_before_driving_forward(): + c = _container(vl_response="[]") + c._latest_odom = _FakeOdom(0.0, 0.0) + c.config.stall_timeout_s = 1.0 + c.config.roam_turn_s = 2.0 + c._wander_tick(c._latest_image, 0.0) + c.cmd_vel.msgs.clear() + c._wander_tick(c._latest_image, 2.0) # enters roam; within the turn phase + turn = c.cmd_vel.msgs[-1] + assert turn.angular.z != 0.0 and turn.linear.x == 0.0 # reorient first + + +def test_roam_hands_back_to_patrol_after_escaping(): + c = _container(vl_response="[]") + c._latest_odom = _FakeOdom(0.0, 0.0) + c.config.stall_timeout_s = 1.0 + c.config.roam_turn_s = 0.0 + c.config.roam_escape_distance_m = 0.5 + c._wander_tick(c._latest_image, 0.0) + c._wander_tick(c._latest_image, 2.0) # stalled -> roaming + assert c._roaming + + c._latest_odom = _FakeOdom(1.0, 0.0) # escaped 1 m > 0.5 m + c._wander_tick(c._latest_image, 2.5) + + assert not c._roaming + assert c._patrolling_module_spec.is_patrolling() # patrol resumed + + +def test_no_roam_while_making_progress(): + c = _container(vl_response="[]") + c._latest_odom = _FakeOdom(0.0, 0.0) + c.config.stall_timeout_s = 1.0 + c._wander_tick(c._latest_image, 0.0) + c._latest_odom = _FakeOdom(0.0, 0.5) # moved 0.5 m (> 0.3 threshold) + c._wander_tick(c._latest_image, 2.0) + assert not c._roaming # movement resets the stall timer + + +def test_no_roam_without_odom(): + c = _container(vl_response="[]") + c._latest_odom = None # no odom -> never drive blind + c.config.stall_timeout_s = 1.0 + c._wander_tick(c._latest_image, 0.0) + c.cmd_vel.msgs.clear() + c._wander_tick(c._latest_image, 5.0) + assert not c._roaming + assert c._patrolling_module_spec.is_patrolling() # patrol owns motion + + +def test_wander_rescans_when_every_detection_is_filtered_out(): + # The freeze case: the dog faces a wall/reflection -- it detects a "person" + # but the costmap filter drops it, leaving no candidate. It must rotate to a + # new heading rather than sit frozen (the patrol is usually goal-less here). + c = _container(vl_response='[{"bbox": [450, 200, 490, 240]}]') + wall = [[100] * 11 for _ in range(11)] # candidate raycasts to (1.5, 0.0) + c._latest_costmap = _FakeCostmap(wall, 0.1, (1.0, -0.5)) + c._patrolling_module_spec.patrolling = True + c.cmd_vel.msgs.clear() + + c._tick(2.0) # past the first scan interval + + assert c._last_scan_dropped > 0 # the detection was filtered out + assert c._rescanning # rotating to look elsewhere, not frozen + last = c.cmd_vel.msgs[-1] + assert last.angular.z != 0.0 and last.linear.x == 0.0 # in-place yaw + assert not c._patrolling_module_spec.is_patrolling() # took over the patrol + + +def test_halt_zeros_velocity_and_stops_patrol(): + c = _container() + c._patrolling_module_spec.patrolling = True + c._halt() + assert c.cmd_vel.msgs[-1].is_zero() + assert not c._patrolling_module_spec.is_patrolling() + + +def test_loop_fails_safe_and_stops_after_repeated_errors(): + c = _container(vl_response="[]") + c._patrolling_module_spec.patrolling = True + c.config.max_consecutive_errors = 2 + c.config.loop_hz = 50.0 + + def boom(_now): + raise RuntimeError("tick boom") + + c._tick = boom # every tick fails + c.start_greeting() + c._thread.join(timeout=2.0) + + assert not c._thread.is_alive() # gave up rather than spinning + assert c.cmd_vel.msgs[-1].is_zero() # halted + assert not c._patrolling_module_spec.is_patrolling() # patrol stopped + assert c._last_snapshot is not None and c._last_snapshot.reason == "error" + + +def test_stop_greeting_also_stops_the_patrol(): + # In wander the patrol owns motion, so stopping the loop alone leaves the + # dog driving -- stop_greeting must stop the patrol too. + c = _container(vl_response="[]") + c._patrolling_module_spec.patrolling = True # robot mid-patrol + c.stop_greeting() + assert not c._patrolling_module_spec.is_patrolling() + assert "stop" in c._patrolling_module_spec.calls + assert c.cmd_vel.msgs[-1].is_zero() + + +class _FakeCenter: + def __init__(self, x, y): + self.x = x + self.y = y + + +class _FakeDetection3D: + def __init__(self, center): + self.center = center + + +class _FakePointCloud: + frame_id = "world" + + def __init__(self, n_points=10): + self._points = np.zeros((n_points, 3)) + + def as_numpy(self): + return self._points, None + + +def test_locate_prefers_lidar_pointcloud_over_raycast(monkeypatch): + # With a non-empty lidar cloud present, _locate ranges from the projected + # points (here faked to centre on (1.5, 0)) not the flat-floor raycast. + c = _container() + c._latest_pointcloud = _FakePointCloud(10) # from_2d itself is faked below + monkeypatch.setattr( + greeter_container.Detection3DPC, "from_2d", + lambda *a, **k: _FakeDetection3D(_FakeCenter(1.5, 0.0)), + ) + # _FakeTf puts the camera at world x=y=0, so the distance is just |centre|. + assert c._locate(_SUBJECT_BBOX, _FakeImage()) == ((1.5, 0.0), 1.5) + + +def test_locate_falls_back_to_raycast_without_lidar(): + c = _container() + assert c._latest_pointcloud is None + assert c._locate(_SUBJECT_BBOX, _FakeImage()) == c._locate_raycast( + _SUBJECT_BBOX, _FakeImage()) + + +def test_locate_falls_back_to_raycast_when_lidar_finds_no_points(monkeypatch): + c = _container() + c._latest_pointcloud = _FakePointCloud(10) + monkeypatch.setattr( + greeter_container.Detection3DPC, "from_2d", lambda *a, **k: None) + assert c._locate(_SUBJECT_BBOX, _FakeImage()) == c._locate_raycast( + _SUBJECT_BBOX, _FakeImage()) + + +def test_locate_falls_back_to_raycast_when_lidar_cloud_is_empty(): + c = _container() + c._latest_pointcloud = _FakePointCloud(0) # cloud arrives but has no points + assert c._locate(_SUBJECT_BBOX, _FakeImage()) == c._locate_raycast( + _SUBJECT_BBOX, _FakeImage()) + + +class _FlakyConnection: + def __init__(self, fail_times, error=None): + self.fail_times = fail_times + self.error = error or Exception("Data channel is not open") + self.calls = 0 + + def publish_request(self, _topic, _data): + self.calls += 1 + if self.calls <= self.fail_times: + raise self.error + return {} + + +def test_send_sport_retries_a_transient_closed_channel_then_succeeds(monkeypatch): + monkeypatch.setattr(greeter_container.time, "sleep", lambda _s: None) + c = _container() + c._connection = _FlakyConnection(fail_times=2) # opens on the 3rd attempt + assert c._send_sport(greeter_container._HELLO_API_ID) is True + assert c._connection.calls == 3 + + +def test_send_sport_gives_up_without_crashing_when_channel_stays_closed( + monkeypatch, +): + monkeypatch.setattr(greeter_container.time, "sleep", lambda _s: None) + c = _container() + c._connection = _FlakyConnection(fail_times=99) # channel never reopens + assert c._send_sport(greeter_container._HELLO_API_ID) is False + assert c._connection.calls == c.config.sport_retry_attempts + + +def test_send_sport_reraises_unexpected_errors(monkeypatch): + # A non-channel error is a real bug -> surface it to the loop fail-safe. + monkeypatch.setattr(greeter_container.time, "sleep", lambda _s: None) + c = _container() + c._connection = _FlakyConnection(fail_times=99, error=ValueError("boom")) + with pytest.raises(ValueError): + c._send_sport(greeter_container._HELLO_API_ID) + + +def test_send_sport_without_a_connection_is_a_no_op(): + c = _container() + c._connection = None + assert c._send_sport(greeter_container._HELLO_API_ID) is False + + +def test_search_twist_yaws_toward_last_seen_side_not_frozen(): + c = _container() + c._last_error_x = 0.8 # subject was last seen to the right + right = c._search_twist() + c._last_error_x = -0.8 # ... to the left + left = c._search_twist() + assert right.linear.x == 0.0 and right.linear.y == 0.0 # never drives blind + assert right.angular.z != 0.0 # yaws to re-find rather than freezing + assert right.angular.z == -left.angular.z # opposite side -> opposite yaw + c._last_error_x = 0.0 + assert c._search_twist().angular.z == 0.0 # nothing to re-centre toward + + +def test_engage_tick_searches_instead_of_freezing_when_track_lost(): + # Tracker returns nothing (track_bbox=None) mid-approach: the dog should yaw + # to re-find the subject, not publish a dead stop. + c = _container(track_bbox=None) + c._last_error_x = 0.8 + obs = c._engage_tick(_FakeImage()) + assert obs.subject_visible is False + assert not c.cmd_vel.msgs[-1].is_zero() # searching, not frozen diff --git a/hackathon/pawtrack/tests/test_greeter_state.py b/hackathon/pawtrack/tests/test_greeter_state.py new file mode 100644 index 0000000000..61d876f4f4 --- /dev/null +++ b/hackathon/pawtrack/tests/test_greeter_state.py @@ -0,0 +1,142 @@ +"""Tests for the pure greeter phase machine.""" + +import json + +from pawtrack.greeter_state import ( + GreeterMachine, + GreeterObservation, + GreeterParams, +) + + +def _params(): + return GreeterParams( + standoff_m=1.5, standoff_tolerance_m=0.3, + greet_duration_s=2.0, cooldown_duration_s=4.0, + ) + + +def _obs(target=False, visible=False, distance=None, reached=False): + return GreeterObservation( + target_acquired=target, subject_visible=visible, distance_m=distance, + reached_destination=reached, + ) + + +def test_starts_wandering(): + assert GreeterMachine().state == "wander" + + +def test_a_visible_subject_alone_does_not_leave_wander(): + # Only a selected (random, unvisited) target leaves wander, not mere + # visibility -- the container does the selection. + m = GreeterMachine(_params()) + assert m.step(_obs(visible=True, distance=1.0), 0.0).state == "wander" + + +def test_full_cycle_wander_to_greet_and_back(): + m = GreeterMachine(_params()) # min_safe 1.0, greet window [1.0, 1.8] + assert m.step(_obs(), 0.0).state == "wander" + + acquired = m.step(_obs(target=True, visible=True, distance=4.0), 1.0) + assert acquired.state == "approach" and acquired.entered + + assert m.step(_obs(visible=True, distance=3.0), 1.5).state == "approach" + + greet = m.step(_obs(visible=True, distance=1.5), 2.0) # at a safe standoff + assert greet.state == "greet" and greet.entered # wave, no facing required + + assert m.step(_obs(visible=True, distance=1.5), 2.5).state == "greet" + + cool = m.step(_obs(visible=True, distance=1.5), 4.0) + assert cool.state == "cooldown" and cool.entered # greet_duration elapsed + + assert m.step(_obs(), 7.9).state == "cooldown" # still cooling + back = m.step(_obs(), 8.0) # cooldown elapsed + assert back.state == "wander" and back.entered # resume patrol + + +def test_approach_needs_distance_to_greet(): + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) + assert m.step(_obs(visible=True, distance=None), 0.5).state == "approach" + + +def test_approach_holds_off_greeting_until_a_safe_distance(): + m = GreeterMachine(_params()) # min_safe_distance_m default 1.0 + m.step(_obs(target=True, visible=True, distance=1.4), 0.0) # approach + # Within standoff but too close (0.6 < 1.0): do not wave, keep approaching. + assert m.step(_obs(visible=True, distance=0.6), 0.1).state == "approach" + # Backed out to a safe standoff -> greet. + assert m.step(_obs(visible=True, distance=1.4), 0.2).state == "greet" + + +def test_greets_immediately_if_already_at_a_safe_distance(): + m = GreeterMachine(_params()) + # Acquired a target already within the greet window -> approach then greet. + m.step(_obs(target=True, visible=True, distance=1.4), 0.0) # approach + assert m.step(_obs(visible=True, distance=1.4), 0.1).state == "greet" + + +def test_losing_the_subject_keeps_approaching_not_bailing(): + # A track loss no longer abandons the engagement: the container dead-reckons + # to the last-known spot, so the machine stays in approach while not visible + # and not yet arrived. + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) + assert m.step(_obs(visible=False, distance=3.0), 1.0).state == "approach" + assert m.step(_obs(visible=False, distance=2.0), 5.0).state == "approach" + + +def test_arriving_at_last_known_without_seeing_subject_gives_up(): + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) # approach + # Drove to the last-known spot (reached) but still cannot see it -> give up. + gone = m.step(_obs(visible=False, distance=0.2, reached=True), 2.0) + assert gone.state == "wander" and gone.reason == "lost" + + +def test_does_not_greet_a_lost_subject_even_at_standoff(): + # At standoff distance but not visible (lost): must not wave -- the greeting + # only fires once the subject is (re)detected. + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) # approach + assert m.step(_obs(visible=False, distance=1.5), 1.0).state == "approach" + # Re-detected at standoff -> greet. + assert m.step(_obs(visible=True, distance=1.5), 1.5).state == "greet" + + +def test_approach_times_out_to_wander_marked_stuck(): + m = GreeterMachine(GreeterParams( + standoff_m=1.5, standoff_tolerance_m=0.3, engage_timeout_s=5.0)) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) # approach + assert m.step(_obs(visible=True, distance=4.0), 4.9).state == "approach" + stuck = m.step(_obs(visible=True, distance=4.0), 5.1) # stalled too long + assert stuck.state == "wander" and stuck.reason == "stuck" + + +def test_greet_and_cooldown_ignore_visibility(): + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=1.4), 0.0) # approach + m.step(_obs(visible=True, distance=1.4), 0.1) # greet at 0.1 + assert m.step(_obs(visible=False), 1.0).state == "greet" + assert m.step(_obs(visible=False), 2.2).state == "cooldown" + + +def test_reset_returns_to_clean_wander(): + m = GreeterMachine(_params()) + m.step(_obs(target=True, visible=True, distance=4.0), 0.0) # -> approach + assert m.state == "approach" + m.reset() + assert m.state == "wander" + # Clean start: only a freshly acquired target leaves wander, not old phase. + assert m.step(_obs(visible=True, distance=1.0), 1.0).state == "wander" + + +def test_snapshot_json_round_trips(): + m = GreeterMachine(_params()) + snap = m.step(_obs(target=True, visible=True, distance=4.0), 1.0) + data = json.loads(snap.to_json()) + assert data["state"] == "approach" + assert data["entered"] is True + assert "message" in data diff --git a/hackathon/pawtrack/tests/test_ground_raycast.py b/hackathon/pawtrack/tests/test_ground_raycast.py new file mode 100644 index 0000000000..d4eb8891cb --- /dev/null +++ b/hackathon/pawtrack/tests/test_ground_raycast.py @@ -0,0 +1,88 @@ +"""Tests for the pure pixel-to-ground raycast.""" + +import numpy as np +import pytest + +from pawtrack.ground_raycast import CameraIntrinsics, pixel_to_ground_point + +# A camera 1 m above the floor looking straight down. Optical +z (view dir) +# maps to map -z (down); a 180-degree roll about x keeps the frame +# right-handed (optical +y -> map -y), so det(R) == +1. +_LOOK_DOWN = np.array( + [ + [1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, -1.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] +) +# fx == fy so a one-focal-length pixel offset is a 45-degree ray. +_INTR = CameraIntrinsics(fx=100.0, fy=100.0, cx=320.0, cy=240.0) + + +def _close(actual, expected, tol=1e-9): + return all(abs(a - e) <= tol for a, e in zip(actual, expected)) + + +def test_center_pixel_lands_directly_below_the_camera(): + point = pixel_to_ground_point((320.0, 240.0), _INTR, _LOOK_DOWN) + assert _close(point, (0.0, 0.0, 0.0)) + + +def test_offset_pixel_projects_with_correct_ground_offset(): + # One focal length right of center, camera 1 m up -> 45-degree ray -> 1 m. + point = pixel_to_ground_point((420.0, 240.0), _INTR, _LOOK_DOWN) + assert _close(point, (1.0, 0.0, 0.0)) + + +def test_ground_z_offset_raises_the_intersection_plane(): + # A non-zero ground_z lifts the intersection plane off the floor. + point = pixel_to_ground_point((320.0, 240.0), _INTR, _LOOK_DOWN, ground_z=0.11) + assert _close(point, (0.0, 0.0, 0.11)) + + +def test_camera_translation_offsets_the_ground_point(): + moved = _LOOK_DOWN.copy() + moved[:3, 3] = [2.0, 3.0, 1.0] + point = pixel_to_ground_point((320.0, 240.0), _INTR, moved) + assert _close(point, (2.0, 3.0, 0.0)) + + +def test_ray_pointing_up_returns_none(): + # Camera looking straight up: optical +z -> map +z, so the floor is behind. + look_up = np.eye(4) + look_up[2, 3] = 1.0 # 1 m above the floor + assert pixel_to_ground_point((320.0, 240.0), _INTR, look_up) is None + + +def test_horizontal_ray_parallel_to_floor_returns_none(): + # Optical +z -> map +x (look along the floor): no vertical component. + look_flat = np.array( + [ + [0.0, 0.0, 1.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 1.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + assert pixel_to_ground_point((320.0, 240.0), _INTR, look_flat) is None + + +def test_from_k_reads_row_major_intrinsics(): + intr = CameraIntrinsics.from_k([100.0, 0.0, 320.0, 0.0, 110.0, 240.0, 0.0, 0.0, 1.0]) + assert (intr.fx, intr.fy, intr.cx, intr.cy) == (100.0, 110.0, 320.0, 240.0) + + +def test_from_k_rejects_wrong_length(): + with pytest.raises(ValueError, match="9 elements"): + CameraIntrinsics.from_k([100.0, 0.0, 320.0]) + + +def test_rejects_non_positive_focal_length(): + with pytest.raises(ValueError, match="focal lengths"): + CameraIntrinsics(fx=0.0, fy=100.0, cx=1.0, cy=1.0) + + +def test_rejects_non_4x4_transform(): + with pytest.raises(ValueError, match="4x4"): + pixel_to_ground_point((320.0, 240.0), _INTR, np.eye(3)) diff --git a/hackathon/pawtrack/tests/test_identify.py b/hackathon/pawtrack/tests/test_identify.py new file mode 100644 index 0000000000..5a086a9ea4 --- /dev/null +++ b/hackathon/pawtrack/tests/test_identify.py @@ -0,0 +1,89 @@ +"""Tests for the multi-target VLM identification prompt + parser.""" + +from pawtrack.identify import ( + build_prompt, + detect_all, + detect_facing, + parse_bboxes, + parse_facing, +) + + +def test_parses_array_of_bbox_objects(): + text = '[{"bbox": [1, 2, 3, 4]}, {"bbox": [5, 6, 7, 8]}]' + assert parse_bboxes(text) == [(1.0, 2.0, 3.0, 4.0), (5.0, 6.0, 7.0, 8.0)] + + +def test_parses_array_of_raw_boxes(): + assert parse_bboxes("[[1, 2, 3, 4]]") == [(1.0, 2.0, 3.0, 4.0)] + + +def test_parses_object_wrapping_a_list(): + assert parse_bboxes('{"objects": [{"bbox": [1, 2, 3, 4]}]}') == [ + (1.0, 2.0, 3.0, 4.0)] + + +def test_parses_a_single_bbox_object(): + # A model that answers with one detection object (not an array) must still + # be parsed -- the dict itself is the box. + assert parse_bboxes('{"bbox": [1, 2, 3, 4]}') == [(1.0, 2.0, 3.0, 4.0)] + assert parse_bboxes('{"name": "a person", "bbox": [1, 2, 3, 4]}') == [ + (1.0, 2.0, 3.0, 4.0)] + + +def test_strips_code_fence_and_prose(): + fenced = '```json\n[{"bbox": [1, 2, 3, 4]}]\n```' + assert parse_bboxes(fenced) == [(1.0, 2.0, 3.0, 4.0)] + prose = 'Here are the boxes: [{"bbox": [1, 2, 3, 4]}] -- done.' + assert parse_bboxes(prose) == [(1.0, 2.0, 3.0, 4.0)] + + +def test_drops_malformed_entries_keeps_valid(): + text = '[{"bbox": [1, 2, 3]}, {"bbox": [5, 6, 7, 8]}, "junk"]' + assert parse_bboxes(text) == [(5.0, 6.0, 7.0, 8.0)] + + +def test_empty_or_none_yields_empty_list(): + assert parse_bboxes("") == [] + assert parse_bboxes("no people here") == [] + assert parse_bboxes("[]") == [] + + +class _FakeVl: + def __init__(self, response): + self.response = response + self.prompt = None + + def query(self, _image, prompt): + self.prompt = prompt + return self.response + + +def test_detect_all_queries_with_description_and_parses(): + vl = _FakeVl('[{"bbox": [1, 2, 3, 4]}]') + boxes = detect_all(vl, object(), "a person sitting on a chair") + assert boxes == [(1.0, 2.0, 3.0, 4.0)] + assert "a person sitting on a chair" in vl.prompt + + +def test_build_prompt_includes_the_description(): + assert "a person in a red shirt" in build_prompt("a person in a red shirt") + + +def test_parse_facing_front_back_and_unknown(): + assert parse_facing("front") is True + assert parse_facing("Front.") is True + assert parse_facing("They are facing toward the camera") is True + assert parse_facing("back") is False + assert parse_facing("facing away") is False + assert parse_facing("") is None # empty + assert parse_facing("not sure") is None # neither cue + assert parse_facing("front or back, hard to tell") is None # both cues + + +def test_detect_facing_queries_and_parses(): + vl = _FakeVl("front") + assert detect_facing(vl, object(), "a chair") is True + assert "facing" in vl.prompt.lower() + assert "a chair" in vl.prompt # the prompt asks about the target object + assert detect_facing(_FakeVl("back"), object(), "a chair") is False diff --git a/hackathon/pawtrack/tests/test_motion_fallback.py b/hackathon/pawtrack/tests/test_motion_fallback.py new file mode 100644 index 0000000000..95e4b03080 --- /dev/null +++ b/hackathon/pawtrack/tests/test_motion_fallback.py @@ -0,0 +1,32 @@ +"""Tests for the pure frame-difference subject-motion fallback.""" + +import cv2 +import numpy as np + +from pawtrack.motion_fallback import detect_motion_bbox + + +def _frame(cx): + img = np.full((200, 320, 3), 200, np.uint8) + cv2.circle(img, (cx, 100), 25, (0, 0, 255), -1) + return img + + +def test_detects_a_moving_blob(): + bbox = detect_motion_bbox(_frame(80), _frame(220)) + assert bbox is not None + x1, y1, x2, y2 = bbox + # The moving region sits in the marker's vertical band and is marker-sized. + assert y1 < 100 < y2 + assert (x2 - x1) > 20 and (y2 - y1) > 20 + + +def test_no_motion_returns_none(): + frame = _frame(150) + assert detect_motion_bbox(frame, frame.copy()) is None + + +def test_mismatched_shapes_returns_none(): + a = np.zeros((10, 10, 3), np.uint8) + b = np.zeros((20, 20, 3), np.uint8) + assert detect_motion_bbox(a, b) is None diff --git a/hackathon/pawtrack/tests/test_no_dimos_import.py b/hackathon/pawtrack/tests/test_no_dimos_import.py new file mode 100644 index 0000000000..cc7606f537 --- /dev/null +++ b/hackathon/pawtrack/tests/test_no_dimos_import.py @@ -0,0 +1,42 @@ +"""Guard: the pure modules must not import DimOS. + +The pure logic (state machines, geometry, metrics) is the part that stays +testable without hardware or heavy models. Importing any of it must not drag in +the ``dimos`` package. This runs in a fresh subprocess because other tests in the +suite legitimately import DimOS, so ``dimos`` is already in this process's +``sys.modules`` by the time this test runs. +""" + +import os +import subprocess +import sys + +_PURE_MODULES = ( + "pawtrack.track_state", + "pawtrack.motion_fallback", + "pawtrack.ground_raycast", + "pawtrack.greeter_state", + "pawtrack.visited_registry", + "pawtrack.approach_geometry", + "pawtrack.identify", + "pawtrack.occupancy", +) + + +def test_pure_modules_do_not_import_dimos(): + code = ( + "import importlib, sys\n" + f"for m in {_PURE_MODULES!r}:\n" + " importlib.import_module(m)\n" + "leaked = sorted(k for k in sys.modules if k.split('.')[0] == 'dimos')\n" + "assert not leaked, leaked\n" + ) + env = dict(os.environ) + env["PYTHONPATH"] = os.path.abspath("src") + result = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + env=env, + ) + assert result.returncode == 0, result.stderr diff --git a/hackathon/pawtrack/tests/test_occupancy.py b/hackathon/pawtrack/tests/test_occupancy.py new file mode 100644 index 0000000000..c9515c934d --- /dev/null +++ b/hackathon/pawtrack/tests/test_occupancy.py @@ -0,0 +1,89 @@ +"""Tests for the occupancy-grid obstacle filter (glass-reflection rejection).""" + +from pawtrack.occupancy import is_near_obstacle + + +def _free_grid(width, height): + """A width x height grid of free (cost 0) cells, indexed grid[y][x].""" + return [[0] * width for _ in range(height)] + + +def test_no_grid_keeps_candidate(): + # No costmap yet -> never suppress greeting (fail open). + assert is_near_obstacle( + None, 0.1, (0.0, 0.0), (1.0, 1.0), clearance_m=0.3) is False + + +def test_empty_grid_keeps_candidate(): + assert is_near_obstacle( + [], 0.1, (0.0, 0.0), (1.0, 1.0), clearance_m=0.3) is False + + +def test_free_cell_keeps_candidate(): + grid = _free_grid(10, 10) + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (0.5, 0.5), clearance_m=0.3) is False + + +def test_unknown_cells_do_not_reject(): + # Unknown space (-1) is not a known wall, so a candidate there is kept. + grid = [[-1] * 10 for _ in range(10)] + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (0.5, 0.5), clearance_m=0.3) is False + + +def test_point_on_occupied_cell_is_rejected(): + grid = _free_grid(10, 10) + grid[5][5] = 100 # lethal wall cell at grid[y=5][x=5] + # world (0.55, 0.55) -> cell (5, 5) + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (0.55, 0.55), clearance_m=0.3) is True + + +def test_occupied_cell_within_clearance_is_rejected(): + grid = _free_grid(20, 20) + grid[10][10] = 100 + # ranging noise: the point lands ~0.2 m from the wall, within the 0.3 m + # clearance, so it is still rejected. + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (1.25, 1.05), clearance_m=0.3) is True + + +def test_occupied_cell_beyond_clearance_is_kept(): + grid = _free_grid(40, 40) + grid[10][10] = 100 + # ~0.5 m from the wall, beyond the 0.3 m clearance -> a real subject. + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (1.55, 1.05), clearance_m=0.3) is False + + +def test_point_off_grid_is_kept(): + grid = _free_grid(10, 10) + grid[5][5] = 100 + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (50.0, 50.0), clearance_m=0.3) is False + + +def test_cost_threshold_respected(): + grid = _free_grid(10, 10) + grid[5][5] = 40 # inflation cost, below the default wall threshold of 50 + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (0.55, 0.55), clearance_m=0.3) is False + assert is_near_obstacle( + grid, 0.1, (0.0, 0.0), (0.55, 0.55), clearance_m=0.3, + cost_threshold=40) is True + + +def test_origin_offset_applied(): + grid = _free_grid(10, 10) + grid[0][0] = 100 + # origin shifted to (-1, -1): world (-0.95, -0.95) maps to cell (0, 0). + assert is_near_obstacle( + grid, 0.1, (-1.0, -1.0), (-0.95, -0.95), clearance_m=0.1) is True + + +def test_zero_resolution_keeps_candidate(): + grid = _free_grid(10, 10) + grid[5][5] = 100 + assert is_near_obstacle( + grid, 0.0, (0.0, 0.0), (0.5, 0.5), clearance_m=0.3) is False diff --git a/hackathon/pawtrack/tests/test_skill_container.py b/hackathon/pawtrack/tests/test_skill_container.py new file mode 100644 index 0000000000..3f454aecc7 --- /dev/null +++ b/hackathon/pawtrack/tests/test_skill_container.py @@ -0,0 +1,255 @@ +"""Container-level tests for the PawTrack subject-tracking skill glue. + +Exercise the DimOS ``PawTrackSkillContainer`` orchestration -- monitor state +reset, malformed input, status publishing, and the ground-pose raycast -- with +fakes for the EdgeTAM tracker and the output streams, so no GPU, robot, or +network is needed. +""" + +import json + +import numpy as np +import pytest + +from pawtrack.ground_raycast import CameraIntrinsics +from pawtrack.skill_container import PawTrackSkillContainer +from pawtrack.track_state import TrackMetrics + + +class _Recorder: + """Stand-in for an ``Out`` stream that records published messages.""" + + def __init__(self): + self.msgs = [] + + def publish(self, msg): + self.msgs.append(msg) + + +class _FakeImage: + """Minimal ``Image`` stand-in: size for bbox math, a frame for overlays.""" + + width = 640 + height = 480 + ts = 0.0 + + def to_opencv(self): + return np.zeros((self.height, self.width, 3), dtype=np.uint8) + + +class _FakeTracker: + """EdgeTAM stand-in; ``init_count`` controls init success vs failure.""" + + def __init__(self, init_count=0): + self._init_count = init_count + + def init_track(self, **_kwargs): + return list(range(self._init_count)) + + def process_image(self, _image): + return [] # always a miss, so the monitor loop just coasts + + def stop(self): + pass + + +class _FakeTransform: + """Transform stand-in exposing the matrix API used by the raycast.""" + + def __init__(self, matrix): + self._matrix = matrix + + def to_matrix(self): + return self._matrix + + +class _FakeTf: + """Frame lookup table for pose-publishing tests.""" + + def __init__(self, transforms): + self._transforms = transforms + + def get(self, frame_id, _child_frame_id, **_kwargs): + return self._transforms.get(frame_id) + + +def _look_down_transform(x=0.0, y=0.0, z=1.0): + transform = np.array( + [ + [1.0, 0.0, 0.0, x], + [0.0, -1.0, 0.0, y], + [0.0, 0.0, -1.0, z], + [0.0, 0.0, 0.0, 1.0], + ] + ) + return _FakeTransform(transform) + + +# A bbox whose bottom-center -- the ground-contact pixel -- is (320, 240). +_CONTACT_BBOX = (300.0, 200.0, 340.0, 240.0) + + +def _container(init_count=0): + container = PawTrackSkillContainer() + container.subject_status = _Recorder() + container.debug_image = _Recorder() + container.subject_world_pose = _Recorder() + container.subject_map_pose = _Recorder() + container._latest_image = _FakeImage() + container._tracker = _FakeTracker(init_count) + return container + + +def _statuses(container): + return [json.loads(msg)["status"] for msg in container.subject_status.msgs] + + +def _metrics(): + return TrackMetrics( + bbox=(10.0, 10.0, 30.0, 30.0), center_px=(20.0, 20.0), + width_px=20.0, height_px=20.0, area_px=400.0, area_ratio=0.001, + image_error_x=0.0, image_error_y=0.0, confidence=0.9, + mask_area_px=320.0, mask_area_ratio=0.001, + ) + + +# -- Perception -- + +def test_track_subject_rejects_malformed_initial_bbox(): + container = _container() + result = container.track_subject("a person", initial_bbox=[1, 2]) + assert "initial_bbox" in result + assert container.subject_status.msgs == [] # nothing published + assert container._snapshot.status == "idle" # state left untouched + + +def test_track_subject_resets_prior_state_and_publishes_acquiring(): + container = _container(init_count=0) # tracker init fails -> no loop thread + # A prior track left a baseline behind. + container._state.observe("a prior subject", _metrics(), 1.0) + assert container._state._last_tracked is not None + + result = container.track_subject( + "a person sitting on a chair", initial_bbox=[10.0, 10.0, 60.0, 60.0]) + + assert "Could not initialize" in result + assert _statuses(container) == ["acquiring", "lost"] + first = json.loads(container.subject_status.msgs[0]) + assert first["description"] == "a person sitting on a chair" + # The old target's baseline was cleared, so it cannot carry forward. + assert container._state._last_tracked is None + + +def test_track_subject_defaults_to_a_seated_person(): + container = _container(init_count=0) + container.track_subject(initial_bbox=[10.0, 10.0, 60.0, 60.0]) + first = json.loads(container.subject_status.msgs[0]) + assert first["description"] == "a person sitting on a chair" + + +def test_status_json_is_well_formed(): + container = _container(init_count=0) + container.track_subject( + "a person sitting on a chair", initial_bbox=[10.0, 10.0, 60.0, 60.0]) + assert container.subject_status.msgs + for msg in container.subject_status.msgs: + data = json.loads(msg) + assert "seen_at" not in data + assert "last_seen_age_s" in data + for key in ("status", "description", "message"): + assert key in data + + +def test_start_then_stop_publishes_stopped(): + container = _container(init_count=1) # tracker init succeeds -> loop starts + result = container.track_subject( + "a person sitting on a chair", initial_bbox=[10.0, 10.0, 60.0, 60.0]) + assert result.startswith("Started tracking") + + stopped = container.stop_tracking() + + assert stopped.startswith("Stopped") + assert container._snapshot.status == "stopped" + statuses = _statuses(container) + assert "acquiring" in statuses and "stopped" in statuses + assert container._thread is None or not container._thread.is_alive() + + +def test_ground_pose_publishes_world_and_optional_map(): + container = _container() + container._intrinsics = CameraIntrinsics( + fx=100.0, fy=100.0, cx=320.0, cy=240.0) + container._tf = _FakeTf({ + "world": _look_down_transform(x=1.0), + "map": _look_down_transform(x=10.0), + }) + + container._publish_ground_poses(_FakeImage(), _CONTACT_BBOX) + + assert len(container.subject_world_pose.msgs) == 1 + assert len(container.subject_map_pose.msgs) == 1 + assert container.subject_world_pose.msgs[0].frame_id == "world" + assert container.subject_map_pose.msgs[0].frame_id == "map" + # The contact pixel rests on the floor, so z is the floor height (0). + assert tuple( + container.subject_world_pose.msgs[0].position) == pytest.approx( + (1.0, 0.0, 0.0)) + assert tuple( + container.subject_map_pose.msgs[0].position) == pytest.approx( + (10.0, 0.0, 0.0)) + + +def test_ground_pose_keeps_world_when_map_tf_missing(): + container = _container() + container._intrinsics = CameraIntrinsics( + fx=100.0, fy=100.0, cx=320.0, cy=240.0) + container._tf = _FakeTf({"world": _look_down_transform()}) + + container._publish_ground_poses(_FakeImage(), _CONTACT_BBOX) + + assert len(container.subject_world_pose.msgs) == 1 + assert container.subject_world_pose.msgs[0].frame_id == "world" + assert container.subject_map_pose.msgs == [] + + +def test_absent_map_tf_lookup_is_throttled_not_per_frame(): + # tf.get logs on every miss, so a missing map frame must not be probed each + # tracked frame. World is present (probed every frame); map is absent + # (probed once, then throttled). + container = _container() + container._intrinsics = CameraIntrinsics( + fx=100.0, fy=100.0, cx=320.0, cy=240.0) + calls = {} + + class _CountingTf: + def get(self, frame_id, _child, **_kwargs): + calls[frame_id] = calls.get(frame_id, 0) + 1 + return _look_down_transform() if frame_id == "world" else None + + container._tf = _CountingTf() + for _ in range(5): + container._publish_ground_poses(_FakeImage(), _CONTACT_BBOX) + + assert calls["world"] == 5 # present -> probed every frame + assert calls["map"] == 1 # absent -> one probe, then throttled + assert container.subject_map_pose.msgs == [] + + +def test_map_lookup_uses_looser_tolerance_than_world(): + # The map<-world TF is republished slowly, so the map lookup must tolerate + # a staler transform than the live odom chain, or it pauses mid-interval. + container = _container() + container._intrinsics = CameraIntrinsics( + fx=100.0, fy=100.0, cx=320.0, cy=240.0) + seen = {} + + class _CaptureTf: + def get(self, frame_id, _child, *, time_point=None, time_tolerance=None): + seen[frame_id] = time_tolerance + return _look_down_transform() + + container._tf = _CaptureTf() + container._publish_ground_poses(_FakeImage(), _CONTACT_BBOX) + + assert seen["map"] > seen["world"] + assert seen["map"] >= 2.0 # must exceed the relocalization publish interval diff --git a/hackathon/pawtrack/tests/test_track_state.py b/hackathon/pawtrack/tests/test_track_state.py new file mode 100644 index 0000000000..e3f5e4f000 --- /dev/null +++ b/hackathon/pawtrack/tests/test_track_state.py @@ -0,0 +1,233 @@ +"""Tests for the pure subject-tracking module: metrics + state machine.""" + +import json +import math + +from pawtrack.track_state import ( + MonitorParams, + MonitorState, + TrackMetrics, + TrackSnapshot, + VisualObservation, + bbox_in_image, + clamp_bbox, + ground_contact_pixel, + is_bbox_shape, + valid_bbox, + visual_metrics, +) + + +def _metrics(center=(20.0, 20.0), width=20.0, area=400.0): + cx, cy = center + half = width / 2.0 + return TrackMetrics( + bbox=(cx - half, cy - half, cx + half, cy + half), + center_px=(cx, cy), width_px=width, height_px=width, + area_px=area, area_ratio=area / (640 * 480), + image_error_x=0.0, image_error_y=0.0, confidence=0.9, + mask_area_px=area * 0.8, mask_area_ratio=0.0, + ) + + +def test_visual_metrics_reports_size_and_center_error(): + metrics = visual_metrics( + VisualObservation( + bbox=(40.0, 20.0, 80.0, 60.0), + image_width=200, + image_height=100, + confidence=0.8, + mask_area_px=1200.0, + ) + ) + + assert metrics.center_px == (60.0, 40.0) + assert metrics.width_px == 40.0 + assert metrics.height_px == 40.0 + assert metrics.area_px == 1600.0 + assert metrics.area_ratio == 0.08 + assert metrics.image_error_x == -0.4 + assert metrics.image_error_y == -0.2 + assert metrics.mask_area_ratio == 0.06 + + +def test_visual_metrics_rejects_invalid_bbox(): + observation = VisualObservation( + bbox=(10.0, 10.0, 10.0, 20.0), + image_width=100, + image_height=100, + ) + + try: + visual_metrics(observation) + except ValueError as err: + assert "bbox" in str(err) + else: + raise AssertionError("invalid bbox should raise") + + +def test_ground_contact_pixel_is_bbox_bottom_center(): + # The subject meets the floor at the bottom-center of its box, not the + # center -- a seated person's box center is torso/seat height. + assert ground_contact_pixel((100.0, 40.0, 140.0, 200.0)) == (120.0, 200.0) + + +def test_is_bbox_shape_accepts_four_finite_numbers(): + assert is_bbox_shape([10.0, 20.0, 30.0, 40.0]) + assert is_bbox_shape((10, 20, 30, 40)) + + +def test_is_bbox_shape_rejects_malformed(): + assert not is_bbox_shape(None) + assert not is_bbox_shape([1, 2]) # too short -- would crash on index 2/3 + assert not is_bbox_shape([1, 2, 3, 4, 5]) # too long + assert not is_bbox_shape(["a", "b", "c", "d"]) # non-numeric + assert not is_bbox_shape([1.0, 2.0, math.inf, 4.0]) # not finite + + +def test_bbox_in_image_rejects_degenerate_and_off_frame(): + assert bbox_in_image((10.0, 10.0, 50.0, 50.0), 100, 100) + assert not bbox_in_image((50.0, 10.0, 50.0, 50.0), 100, 100) # zero width + assert not bbox_in_image((-50.0, -50.0, -10.0, -10.0), 100, 100) # off + assert bbox_in_image((-20.0, -20.0, 30.0, 30.0), 100, 100) # partial + + +def test_valid_bbox_combines_shape_and_overlap(): + assert valid_bbox([10.0, 10.0, 50.0, 50.0], 100, 100) + assert not valid_bbox([1, 2], 100, 100) # bad shape, no crash + assert not valid_bbox([200.0, 200.0, 300.0, 300.0], 100, 100) # off-frame + + +def test_clamp_bbox_keeps_in_bounds_and_clamps_overflow(): + assert clamp_bbox((10.0, 10.0, 50.0, 50.0), 100, 100) == ( + 10.0, 10.0, 50.0, 50.0) + # A box spilling past two edges clamps to the visible portion, staying + # non-degenerate (x2 > x1, y2 > y1) so EdgeTAM init gets a usable box. + clamped = clamp_bbox((-20.0, -20.0, 130.0, 130.0), 100, 100) + assert clamped == (0.0, 0.0, 100.0, 100.0) + edge = clamp_bbox((90.0, 90.0, 130.0, 130.0), 100, 100) + assert edge == (90.0, 90.0, 100.0, 100.0) + assert edge[2] > edge[0] and edge[3] > edge[1] + + +def test_begin_is_acquiring(): + snap = MonitorState().begin("a person sitting on a chair") + assert snap.status == "acquiring" + assert snap.description == "a person sitting on a chair" + + +def test_tracks_then_lost_retains_last_fix(): + state = MonitorState() + state.begin("person") + snap, action = state.observe( + "person", _metrics(center=(20.0, 20.0)), 100.0) + assert snap.status == "tracking" and action == "track" + lost, action = state.observe("person", None, 100.5) + assert lost.status == "lost" and action == "coast" + assert lost.center_px == (20.0, 20.0) and lost.seen_at == 100.0 + + +def test_lost_with_no_prior_track_has_no_bbox(): + state = MonitorState() + state.begin("person") + lost, _ = state.observe("person", None, 0.0) + assert lost.status == "lost" and lost.bbox is None + + +def test_drift_gate_rejects_a_huge_center_jump(): + state = MonitorState() + state.begin("person") + state.observe("person", _metrics(center=(100.0, 100.0), width=20.0), 0.0) + snap, action = state.observe( + "person", _metrics(center=(400.0, 400.0), width=20.0), 0.1) + # ~424px jump >> 2.5 * 20 -> rejected as drift, treated as a miss. + assert snap.status == "lost" and action == "coast" + assert snap.center_px == (100.0, 100.0) # kept the last good fix + + +def test_drift_gate_rejects_area_explosion(): + state = MonitorState() + state.begin("person") + state.observe("person", _metrics(center=(100.0, 100.0), area=400.0), 0.0) + snap, _ = state.observe( + "person", _metrics(center=(100.0, 100.0), area=4000.0), 0.1) + assert snap.status == "lost" # 10x area jump rejected + + +def test_drift_gate_allows_normal_motion(): + state = MonitorState() + state.begin("person") + state.observe("person", _metrics(center=(100.0, 100.0), width=20.0), 0.0) + snap, action = state.observe( + "person", _metrics(center=(120.0, 100.0), width=20.0), 0.1) + assert snap.status == "tracking" and action == "track" + assert snap.center_px == (120.0, 100.0) + + +def test_marks_stale_after_timeout(): + state = MonitorState( + MonitorParams(stale_timeout_s=1.0, max_lost_frames=999)) + state.begin("person") + state.observe("person", _metrics(), 0.0) + fresh, _ = state.observe("person", None, 0.5) + assert fresh.status == "lost" # still within the timeout + stale, _ = state.observe("person", None, 2.0) + assert stale.status == "stale" # aged out + assert stale.center_px == (20.0, 20.0) # still carries the last fix + + +def test_reacquired_skips_drift_gate(): + state = MonitorState() + state.begin("person") + state.observe("person", _metrics(center=(50.0, 50.0)), 0.0) + state.reacquired() # VLM/motion found it, possibly far away + snap, action = state.observe( + "person", _metrics(center=(500.0, 400.0)), 0.2) + assert snap.status == "tracking" and action == "track" + assert snap.center_px == (500.0, 400.0) # far jump accepted post-reacquire + + +def test_reacquire_cadence_then_give_up(): + state = MonitorState(MonitorParams( + max_lost_frames=3, reacquire_interval_frames=2, + max_reacquire_attempts=2, stale_timeout_s=1e9, + )) + state.observe("person", _metrics(), 0.0) # establish baseline + actions = [state.observe("person", None, 0.0)[1] for _ in range(7)] + assert actions == [ + "coast", "coast", "reacquire", "coast", "reacquire", "coast", "give_up" + ] + + +def test_reacquired_resets_lost_counter(): + state = MonitorState(MonitorParams( + max_lost_frames=3, reacquire_interval_frames=2, stale_timeout_s=1e9)) + state.observe("person", _metrics(), 0.0) + for _ in range(3): + state.observe("person", None, 0.0) # 3rd is reacquire-due + state.reacquired() + assert state.observe("person", None, 0.0)[1] == "coast" + + +def test_stopped_and_errored(): + state = MonitorState() + state.observe("person", _metrics(), 5.0) + err = state.errored("person", "Monitor loop error; see logs.") + assert err.status == "error" and err.center_px == (20.0, 20.0) + assert err.message == "Monitor loop error; see logs." + stopped = state.stopped() + assert stopped.status == "stopped" and stopped.bbox is None + + +def test_json_schema_drops_seen_at_adds_age(): + snap = TrackSnapshot( + status="tracking", description="person", message="Tracking person.", + seen_at=10.0, bbox=(1.0, 2.0, 3.0, 4.0), + ) + data = json.loads(snap.to_json(now=12.5)) + assert "seen_at" not in data + assert data["last_seen_age_s"] == 2.5 + assert data["status"] == "tracking" + assert data["bbox"] == [1.0, 2.0, 3.0, 4.0] + idle = TrackSnapshot(status="idle", description=None, message="-") + assert json.loads(idle.to_json())["last_seen_age_s"] is None diff --git a/hackathon/pawtrack/tests/test_visited_registry.py b/hackathon/pawtrack/tests/test_visited_registry.py new file mode 100644 index 0000000000..def332e17c --- /dev/null +++ b/hackathon/pawtrack/tests/test_visited_registry.py @@ -0,0 +1,103 @@ +"""Tests for the pure visited-subject memory and target selection.""" + +import random + +import pytest + +from pawtrack.visited_registry import ( + Candidate, + VisitedRegistry, + select_target, +) + + +def _cand(x, y, bbox=(0.0, 0.0, 10.0, 10.0)): + return Candidate(position=(x, y), bbox=bbox) + + +def test_marks_and_detects_visited_within_radius(): + registry = VisitedRegistry(revisit_radius_m=1.0) + assert not registry.is_visited((0.0, 0.0)) + registry.mark_visited((0.0, 0.0)) + assert registry.is_visited((0.5, 0.0)) # within the radius -> same subject + assert not registry.is_visited((2.0, 0.0)) # a different subject + + +def test_unvisited_filters_already_greeted(): + registry = VisitedRegistry(revisit_radius_m=1.0) + registry.mark_visited((0.0, 0.0)) + candidates = [_cand(0.2, 0.0), _cand(3.0, 0.0)] + assert registry.unvisited(candidates) == [candidates[1]] + + +def test_select_target_never_picks_a_visited_subject(): + registry = VisitedRegistry(revisit_radius_m=1.0) + registry.mark_visited((0.0, 0.0)) + candidates = [_cand(0.1, 0.0), _cand(5.0, 0.0), _cand(6.0, 0.0)] + for seed in range(20): + chosen = select_target(candidates, registry, random.Random(seed)) + assert chosen in candidates[1:] # never the greeted one + + +def test_select_target_none_when_all_visited(): + registry = VisitedRegistry(revisit_radius_m=1.0) + registry.mark_visited((0.0, 0.0)) + registry.mark_visited((5.0, 0.0)) + candidates = [_cand(0.1, 0.0), _cand(5.1, 0.0)] + assert select_target(candidates, registry, random.Random(0)) is None + + +def test_select_target_none_when_no_candidates(): + assert select_target([], VisitedRegistry(1.0), random.Random(0)) is None + + +def test_select_target_is_random_yet_reproducible(): + candidates = [_cand(float(i), 0.0) for i in range(10)] + a = select_target(candidates, VisitedRegistry(0.1), random.Random(3)) + b = select_target(candidates, VisitedRegistry(0.1), random.Random(3)) + assert a == b # same seed -> same pick + + picks = { + select_target( + candidates, VisitedRegistry(0.1), random.Random(seed) + ).position + for seed in range(20) + } + assert len(picks) > 1 # actually varies across seeds + + +def test_revisit_radius_must_be_positive(): + with pytest.raises(ValueError): + VisitedRegistry(0.0) + + +def test_forget_after_s_must_be_positive_or_none(): + VisitedRegistry(1.0, forget_after_s=None) # no forget -> ok + VisitedRegistry(1.0, forget_after_s=30.0) # ok + with pytest.raises(ValueError): + VisitedRegistry(1.0, forget_after_s=0.0) + + +def test_greeting_is_forgotten_after_the_forget_window(): + registry = VisitedRegistry(revisit_radius_m=1.0, forget_after_s=60.0) + registry.mark_visited((0.0, 0.0), now=10.0) + assert registry.is_visited((0.2, 0.0), now=40.0) # within the window + assert registry.is_visited((0.2, 0.0), now=70.0) # exactly at the window + assert not registry.is_visited((0.2, 0.0), now=71.0) # forgotten + + +def test_select_target_repicks_a_subject_after_it_is_forgotten(): + registry = VisitedRegistry(revisit_radius_m=1.0, forget_after_s=60.0) + candidates = [_cand(0.0, 0.0)] + registry.mark_visited((0.0, 0.0), now=0.0) + # Still remembered -> skipped. + assert select_target(candidates, registry, random.Random(0), now=30.0) is None + # Forgotten -> greet-eligible again. + chosen = select_target(candidates, registry, random.Random(0), now=90.0) + assert chosen == candidates[0] + + +def test_none_forget_never_expires(): + registry = VisitedRegistry(revisit_radius_m=1.0, forget_after_s=None) + registry.mark_visited((0.0, 0.0), now=0.0) + assert registry.is_visited((0.0, 0.0), now=1_000_000.0) # never forgotten