diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/__init__.py b/dimos/robot/unitree/go2/blueprints/hackathon/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/blueprint.py b/dimos/robot/unitree/go2/blueprints/hackathon/blueprint.py new file mode 100644 index 0000000000..a69e8457b3 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/blueprint.py @@ -0,0 +1,50 @@ +"""unitree-go2-hackathon blueprint. + +Build chain (avoids SecurityModule which requires CUDA/EdgeTAM): + unitree_go2_basic → unitree_go2 → +SpatialMemory +PerceiveLoopSkill + (SecurityModule intentionally excluded) +Then adds: McpServer + hackathon skill containers. +""" + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.navigation import NavigationSkillContainer +from dimos.agents.skills.speak_skill import SpeakSkill +from dimos.agents.web_human_input import WebInput +from dimos.core.coordination.blueprints import autoconnect +from dimos.perception.perceive_loop_skill import PerceiveLoopSkill +from dimos.perception.spatial_perception import SpatialMemory +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2 import unitree_go2 +from dimos.robot.unitree.go2.connection import GO2Connection +from dimos.robot.unitree.unitree_skill_container import UnitreeSkillContainer +from dimos.robot.unitree.go2.blueprints.hackathon.dog_mode import DogModeModule +from dimos.robot.unitree.go2.blueprints.hackathon.find import FindSkillContainer +from dimos.robot.unitree.go2.blueprints.hackathon.perception_loop import PerceptionLoopModule +from dimos.robot.unitree.go2.blueprints.hackathon.smart_follow import SmartFollowSkillContainer + +# Spatial layer without SecurityModule (no CUDA required) +_unitree_go2_spatial_no_security = autoconnect( + unitree_go2, + SpatialMemory.blueprint(), + PerceiveLoopSkill.blueprint(), +).global_config(n_workers=8) + +_hackathon_agentic = autoconnect( + NavigationSkillContainer.blueprint(), + PerceptionLoopModule.blueprint(camera_info=GO2Connection.camera_info_static), + SmartFollowSkillContainer.blueprint(camera_info=GO2Connection.camera_info_static), + FindSkillContainer.blueprint(), + DogModeModule.blueprint(camera_info=GO2Connection.camera_info_static), + UnitreeSkillContainer.blueprint(), + WebInput.blueprint(), + SpeakSkill.blueprint(), +) + +unitree_go2_hackathon = autoconnect( + _unitree_go2_spatial_no_security, + McpServer.blueprint(), + McpClient.blueprint(), + _hackathon_agentic, +) + +__all__ = ["unitree_go2_hackathon"] diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/dog_mode.py b/dimos/robot/unitree/go2/blueprints/hackathon/dog_mode.py new file mode 100644 index 0000000000..d66135f075 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/dog_mode.py @@ -0,0 +1,353 @@ +"""DogModeModule — autonomous dog behavior for Unitree Go2. + +State machine: + WANDERING — explore environment, periodic sniff animation + ALERT — person detected, face them, bark randomly based on threat level + CLOSE — person within growl range, track and growl, stop moving + +Threat levels (assigned per track_id on first detection, random): + 0.0–0.4 curious: occasional soft bark, no growl + 0.4–0.7 cautious: bark after short delay, growl at 2.0m + 0.7–1.0 aggressive: bark immediately, growl at 2.5m +""" + +from __future__ import annotations + +import random +import time +from pathlib import Path +from threading import Event, RLock, Thread +from typing import Any + +from reactivex.disposable import Disposable +from unitree_webrtc_connect.constants import RTC_TOPIC, SPORT_CMD + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.navigation.visual_servoing.visual_servoing_2d import VisualServoing2D +from dimos.perception.detection.detectors.person.yolo import YoloPersonDetector +from dimos.robot.unitree.go2.connection_spec import GO2ConnectionSpec +from dimos.utils.logging_config import setup_logger +from dimos_lcm.std_msgs import Bool +from dimos.robot.unitree.go2.blueprints.hackathon.frame_writer import write_annotated_frame, write_state +from dimos.robot.unitree.go2.blueprints.hackathon.perception_loop import get_shared_detections, set_active_mode, shared_detections_fresh + +logger = setup_logger() + +SOUNDS_DIR = Path(__file__).parent / "sounds" +SOUND_FILES = { + "bark": SOUNDS_DIR / "minecraft-dog-bark.mp3", + "deepbark": SOUNDS_DIR / "deepbark.mp3", + "growl": SOUNDS_DIR / "dog-growling-and-barking.mp3", + "perry": SOUNDS_DIR / "perry-the-platypuss-growl.mp3", +} + +# Threat thresholds +_AGGRESSIVE_THRESHOLD = 0.7 +_CAUTIOUS_THRESHOLD = 0.4 + +# Distance thresholds (meters, estimated from bbox width) +_CLOSE_AGGRESSIVE = 2.5 # growl distance for aggressive +_CLOSE_CAUTIOUS = 2.0 # growl distance for cautious +_FACE_RANGE = 5.0 # start facing at this distance + +# Timing +_SNIFF_INTERVAL_MIN = 20.0 # minimum seconds between sniffs +_SNIFF_INTERVAL_MAX = 45.0 +_BARK_COOLDOWN = 4.0 # minimum seconds between barks +_LOOP_HZ = 10.0 # main loop frequency + + +def _play_sound(name: str) -> None: + path = SOUND_FILES.get(name) + if path and path.exists(): + try: + import sounddevice as sd + import soundfile as sf + data, sr = sf.read(str(path)) + sd.play(data.astype("float32"), sr) + sd.wait() + except Exception as e: + logger.warning(f"Sound playback failed: {e}") + + +class Config(ModuleConfig): + camera_info: CameraInfo + + +class DogModeModule(Module): + """Autonomous dog behavior module.""" + + config: Config + + color_image: In[Image] + cmd_vel: Out[Twist] + explore_cmd: Out[Bool] + stop_explore_cmd: Out[Bool] + + _connection: GO2ConnectionSpec # injected by dimOS — used for sport commands + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._detector = YoloPersonDetector() # fallback only + self._servo = VisualServoing2D(self.config.camera_info) + + self._latest_image: Image | None = None + self._thread: Thread | None = None + self._should_stop = Event() + self._lock = RLock() + + # Threat memory: track_id → threat float [0,1] + self._threats: dict[int, float] = {} + self._state = "IDLE" + + @rpc + def start(self) -> None: + super().start() + self.register_disposable(Disposable(self.color_image.subscribe(self._on_image))) + + @rpc + def stop(self) -> None: + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + self._detector.stop() + super().stop() + + def _on_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + @skill + def start_dog_mode(self) -> str: + """Start autonomous dog mode. + + The robot will: + - Roam and explore the environment + - Periodically sniff the ground (StandDown animation) + - Detect people and assess threat level (random, per person) + - Bark at people based on threat level + - Growl and track anyone who gets close + - Face people when they move toward it + """ + if self._thread and self._thread.is_alive(): + return "Dog mode is already running." + + self._should_stop.clear() + self._threats.clear() + self._state = "WANDERING" + set_active_mode("dog") + + self._thread = Thread(target=self._dog_loop, daemon=True, name="DogMode") + self._thread.start() + + return "Woof. Dog mode activated. I'll explore, sniff around, and keep an eye on people." + + @skill + def stop_dog_mode(self) -> str: + """Stop dog mode and return the robot to idle.""" + self._should_stop.set() + set_active_mode(None) + self._send_stop_explore() + self.cmd_vel.publish(Twist.zero()) + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + self._state = "IDLE" + return "Dog mode stopped." + + # ────────────────────────────────────────────── + # Internal helpers + # ────────────────────────────────────────────── + + def _sport(self, name: str) -> None: + try: + cmd_id = SPORT_CMD[name] + self._connection.publish_request(RTC_TOPIC["SPORT_MOD"], {"api_id": cmd_id}) + except Exception as e: + logger.warning(f"Sport command '{name}' failed: {e}") + + def _send_explore(self) -> None: + msg = Bool(); msg.data = True + self.explore_cmd.publish(msg) + + def _send_stop_explore(self) -> None: + msg = Bool(); msg.data = True + self.stop_explore_cmd.publish(msg) + + def _threat_for(self, track_id: int) -> float: + if track_id not in self._threats: + self._threats[track_id] = random.random() + logger.info(f"DogMode: new person track_id={track_id} threat={self._threats[track_id]:.2f}") + return self._threats[track_id] + + def _growl_distance(self, threat: float) -> float: + if threat >= _AGGRESSIVE_THRESHOLD: + return _CLOSE_AGGRESSIVE + if threat >= _CAUTIOUS_THRESHOLD: + return _CLOSE_CAUTIOUS + return 0.0 # curious — no growl + + def _face_person(self, bbox: tuple, image_width: int) -> None: + """Publish angular-only twist to face person (no linear movement).""" + x1, _, x2, _ = bbox + cx = (x1 + x2) / 2.0 + fx = self.config.camera_info.K[0] + optical_cx = self.config.camera_info.K[2] + x_norm = (cx - optical_cx) / fx + angular_z = float(-x_norm * 1.2) + angular_z = max(-0.6, min(0.6, angular_z)) + if abs(angular_z) > 0.05: + self.cmd_vel.publish( + Twist(linear=Vector3(0.0, 0.0, 0.0), angular=Vector3(0.0, 0.0, angular_z)) + ) + else: + self.cmd_vel.publish(Twist.zero()) + + def _do_sniff(self) -> None: + """StandDown → pause → RecoveryStand in a background thread.""" + def _sniff() -> None: + self.cmd_vel.publish(Twist.zero()) + self._sport("StandDown") + time.sleep(2.0) + self._sport("RecoveryStand") + time.sleep(1.0) + Thread(target=_sniff, daemon=True).start() + + # ────────────────────────────────────────────── + # Main loop + # ────────────────────────────────────────────── + + def _dog_loop(self) -> None: + last_sniff = time.monotonic() - random.uniform(0, _SNIFF_INTERVAL_MIN) + last_bark = 0.0 + next_sniff_in = random.uniform(_SNIFF_INTERVAL_MIN, _SNIFF_INTERVAL_MAX) + sniffing = False + frame_count = 0 + + # Start exploring + self._send_explore() + + period = 1.0 / _LOOP_HZ + next_t = time.monotonic() + + while not self._should_stop.is_set(): + next_t += period + now = time.monotonic() + + with self._lock: + image = self._latest_image + + if image is None: + time.sleep(period) + continue + + frame_count += 1 + + # ── Detect people — use shared YOLO pass when fresh ── + if shared_detections_fresh(): + all_dets, _ = get_shared_detections() + persons = [d for d in all_dets if getattr(d, "name", "") == "person"] + else: + dets = self._detector.process_image(image) + persons = [d for d in dets.detections if d.is_valid()] + + if not persons: + # Nobody around — wander + if self._state != "WANDERING": + self._state = "WANDERING" + self._send_explore() + + # Sniff periodically + if not sniffing and (now - last_sniff) >= next_sniff_in: + sniffing = True + last_sniff = now + next_sniff_in = random.uniform(_SNIFF_INTERVAL_MIN, _SNIFF_INTERVAL_MAX) + self._do_sniff() + self._should_stop.wait(timeout=3.2) + sniffing = False + + else: + # ── Person(s) detected ── + # Pick the most prominent (nearest) + target = max(persons, key=lambda d: d.bbox_2d_volume()) + tid = target.track_id + threat = self._threat_for(tid) + dist = self._servo._estimate_distance(target.bbox) + growl_dist = self._growl_distance(threat) + + # Stop exploration while attending to person + if self._state == "WANDERING": + self._send_stop_explore() + self._state = "ALERT" + + is_close = dist is not None and growl_dist > 0 and dist <= growl_dist + + if is_close: + # ── CLOSE: growl and face, stop moving ── + if self._state != "CLOSE": + self._state = "CLOSE" + Thread(target=_play_sound, args=("growl",), daemon=True).start() + self._face_person(target.bbox, image.width) + + else: + # ── ALERT: face person, decide whether to bark ── + self._state = "ALERT" + + # Face if within range + if dist is None or dist <= _FACE_RANGE: + self._face_person(target.bbox, image.width) + else: + self.cmd_vel.publish(Twist.zero()) + + # Bark logic by threat level + if now - last_bark >= _BARK_COOLDOWN: + if threat >= _AGGRESSIVE_THRESHOLD: + # Aggressive: always bark on sight + last_bark = now + sound = "deepbark" if random.random() < 0.5 else "bark" + Thread(target=_play_sound, args=(sound,), daemon=True).start() + elif threat >= _CAUTIOUS_THRESHOLD: + # Cautious: 40% chance each cycle (at 10Hz → ~0.5Hz effective) + if random.random() < 0.04: + last_bark = now + Thread(target=_play_sound, args=("bark",), daemon=True).start() + else: + # Curious: occasional soft bark (10% per cycle) + if random.random() < 0.01: + last_bark = now + Thread(target=_play_sound, args=("bark",), daemon=True).start() + + # ── Annotate frame every 3 ticks for dashboard (~3fps) ── + if frame_count % 3 == 0: + try: + bgr = image.to_opencv() + det_list = [] + for p in persons: + tid = p.track_id + det_list.append({ + "bbox": p.bbox, + "track_id": tid, + "threat": self._threats.get(tid), + }) + write_annotated_frame(bgr, det_list, overlay_text=f"DOG MODE\n{self._state}") + write_state({"mode": "dog", "state": self._state, "persons": len(persons), + "threats": {str(k): round(v, 2) for k, v in self._threats.items()}}) + except Exception: + pass + + # Pace loop + sleep_dur = next_t - time.monotonic() + if sleep_dur > 0: + time.sleep(sleep_dur) + + self.cmd_vel.publish(Twist.zero()) + self._send_stop_explore() diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/find.py b/dimos/robot/unitree/go2/blueprints/hackathon/find.py new file mode 100644 index 0000000000..f26b99d818 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/find.py @@ -0,0 +1,299 @@ +"""FindSkillContainer — explore + find anything using YOLO+CLIP in the background. + +Architecture: + - Declares explore_cmd / stop_explore_cmd Out streams → auto-wires to WavefrontFrontierExplorer + - Declares odom In stream → knows where it found the target + - YOLO+CLIP runs in background while the robot is moving + - Found → stops exploration, reports position, optionally hands off to SmartFollow +""" + +from __future__ import annotations + +import time +from threading import Event, RLock, Thread +from typing import Any + +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger +from dimos_lcm.std_msgs import Bool +from dimos.robot.unitree.go2.blueprints.hackathon.frame_writer import write_annotated_frame, write_state +from dimos.robot.unitree.go2.blueprints.hackathon.perception_loop import PerceptionLoopModule, set_active_mode + +logger = setup_logger() + +_MATCH_THRESHOLD = 0.22 + +# Full YOLO11 COCO class list — these skip CLIP, pure class filter +_YOLO_CLASSES = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", + "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", + "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", + "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", + "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", + "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", + "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", + "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", + "toothbrush", +} + + +class Config(ModuleConfig): + pass + + +class _Det: + """Wrap dict-returned detections from PerceptionLoop with the small attr + surface the rest of this module needs (.track_id / .bbox / .name).""" + __slots__ = ("track_id", "bbox", "name", "confidence") + def __init__(self, m: dict) -> None: + self.track_id = m.get("track_id", -1) + self.bbox = tuple(m["bbox"]) + self.name = m.get("name", "") + self.confidence = m.get("score", m.get("confidence", 0.0)) + def bbox_2d_volume(self) -> float: + x1, y1, x2, y2 = self.bbox + return max(0.0, x2 - x1) * max(0.0, y2 - y1) + + +class FindSkillContainer(Module): + """Find + explore skill. + + Starts autonomous frontier exploration while watching the camera for a target. + When found, stops exploration and reports position to Claude via tool_update. + Shares CLIP + YOLO with PerceptionLoop via RPC (no local model loads). + """ + + color_image: In[Image] + odom: In[PoseStamped] + # Shared inference layer — ONE MobileCLIP + ONE YOLO across all skills. + _perception: PerceptionLoopModule + + # These wire directly to WavefrontFrontierExplorer via name+type matching + explore_cmd: Out[Bool] + stop_explore_cmd: Out[Bool] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + # No own MobileCLIP / YOLO — both come from PerceptionLoop via RPC. + # Saves ~1.85 GB of duplicated model weights per skill module. + self._latest_image: Image | None = None + self._latest_odom: PoseStamped | None = None + self._thread: Thread | None = None + self._should_stop = Event() + self._lock = RLock() + + @rpc + def start(self) -> None: + super().start() + # CLIP loads lazily on first embed — eager start() caused concurrent + # multi-process model loads to stall startup. + self.register_disposable(Disposable(self.color_image.subscribe(self._on_image))) + self.register_disposable(Disposable(self.odom.subscribe(self._on_odom))) + + @rpc + def stop(self) -> None: + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + super().stop() + + def _on_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + def _on_odom(self, odom: PoseStamped) -> None: + with self._lock: + self._latest_odom = odom + + @skill + def smart_find(self, query: str, explore: bool = True, then_approach: bool = False, timeout_s: float = 0.0) -> str: + """Explore the environment while searching for a specific person or object. + + Starts autonomous frontier exploration and watches the camera for the target. + When found, stops exploration and reports back. If then_approach=True, the robot + will also move close to the object after finding it. + + Works for people ("person with red shirt"), YOLO objects ("chair", "dog", "bottle"), + or any visual description. + + Args: + query: What to look for, e.g. "red chair", "dog", "person with backpack" + explore: If True (default), start autonomous exploration while searching. + then_approach: If True, automatically approach and get close after finding. + timeout_s: Give up after this many seconds (0 = no timeout, runs until found or stopped). + + Returns: + Immediate confirmation. Will send tool_update when found or timed out. + """ + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + self._should_stop.clear() + + with self._lock: + image = self._latest_image + + if image is None: + return "No camera image yet — is DimOS running?" + + if explore: + cmd = Bool() + cmd.data = True + self.explore_cmd.publish(cmd) + + set_active_mode("find") + self.start_tool("smart_find") + self._thread = Thread( + target=self._find_loop, + args=(query, explore, then_approach, timeout_s), + daemon=True, + name="SmartFind", + ) + self._thread.start() + + return ( + f"Searching for '{query}'. " + + ("Exploration started. " if explore else "Watching without moving. ") + + f"Timeout: {int(timeout_s)}s. Will update you when found." + ) + + @skill + def stop_find(self) -> str: + """Stop an active find/explore mission and halt exploration.""" + self._should_stop.set() + set_active_mode(None) + self._send_stop_explore() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + return "Find mission stopped." + + def _send_stop_explore(self) -> None: + cmd = Bool() + cmd.data = True + self.stop_explore_cmd.publish(cmd) + + def _get_detections(self, image: Image, class_filter: str | None = None) -> list: + """Pull detections from the shared PerceptionLoop (RPC).""" + try: + raw = self._perception.current_detections(class_filter=class_filter) + except Exception: + return [] + return [_Det(m) for m in raw] + + def _find_loop(self, query: str, exploring: bool, then_approach: bool, timeout_s: float) -> None: + class_mode = query.lower() in _YOLO_CLASSES + + start_time = time.monotonic() + frame_count = 0 + clip_interval = 5 + + unbounded = timeout_s <= 0 + while not self._should_stop.is_set(): + elapsed = time.monotonic() - start_time + if not unbounded and elapsed > timeout_s: + if exploring: + self._send_stop_explore() + self.tool_update("smart_find", f"Search timed out after {int(elapsed)}s. '{query}' not found.") + set_active_mode(None) + self.stop_tool("smart_find") + return + + with self._lock: + image = self._latest_image + odom = self._latest_odom + + if image is None: + time.sleep(0.1) + continue + + frame_count += 1 + + if class_mode: + matches = self._get_detections(image, class_filter=query.lower()) + if matches: + best = max(matches, key=lambda d: d.bbox_2d_volume()) + try: + all_dets = self._get_detections(image) + det_list = [{"bbox": d.bbox, "track_id": d.track_id, + "label": "FOUND" if d.track_id == best.track_id else None} + for d in all_dets] + write_annotated_frame(image.to_opencv(), det_list, overlay_text=f"FIND\nFOUND\n{query}") + write_state({"mode": "find", "state": "FOUND", "query": query}) + except Exception: + pass + set_active_mode(None) + self._report_found(query, best, odom, exploring, then_approach) + return + + elif frame_count % clip_interval == 0: + # Shared-CLIP scoring via PerceptionLoop — no local inference, + # no duplicate MobileCLIP load. + try: + ranked = self._perception.match_text(query) + except Exception: + ranked = [] + if ranked and ranked[0]["score"] >= _MATCH_THRESHOLD: + best = _Det(ranked[0]) + try: + all_dets = self._get_detections(image) + det_list = [{"bbox": d.bbox, "track_id": d.track_id, + "label": "FOUND" if d.track_id == best.track_id else None} + for d in all_dets] + write_annotated_frame(image.to_opencv(), det_list, overlay_text=f"FIND\nFOUND\n{query}") + write_state({"mode": "find", "state": "FOUND", "query": query}) + except Exception: + pass + set_active_mode(None) + self._report_found(query, best, odom, exploring, then_approach) + return + + # Annotate searching frames every 3 ticks + if frame_count % 3 == 0: + try: + cur_dets = self._get_detections(image) + det_list = [{"bbox": d.bbox, "track_id": d.track_id} for d in cur_dets] + write_annotated_frame(image.to_opencv(), det_list, overlay_text=f"FIND\nSEARCHING\n{query}") + write_state({"mode": "find", "state": "SEARCHING", "query": query}) + except Exception: + pass + + time.sleep(1.0 / 15.0) + + if exploring: + self._send_stop_explore() + set_active_mode(None) + self.tool_update("smart_find", f"Find mission for '{query}' was stopped.") + self.stop_tool("smart_find") + + def _report_found(self, query: str, detection: Any, odom: PoseStamped | None, exploring: bool, then_approach: bool) -> None: + if exploring: + self._send_stop_explore() + + pos_str = ( + f"x={odom.x:.2f}, y={odom.y:.2f}" if odom is not None else "position unknown" + ) + x1, y1, x2, y2 = detection.bbox + center_x = (x1 + x2) / 2 + img_w = detection.image.width if hasattr(detection, "image") else 1280 + side = "left" if center_x < img_w * 0.4 else "right" if center_x > img_w * 0.6 else "center" + + approach_hint = f" Now call smart_approach('{query}') to move close." if then_approach else "" + self.tool_update( + "smart_find", + f"Found '{query}'! Robot at {pos_str}, target is {side} of frame.{approach_hint}", + ) + self.stop_tool("smart_find") diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/frame_writer.py b/dimos/robot/unitree/go2/blueprints/hackathon/frame_writer.py new file mode 100644 index 0000000000..4c43878aa6 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/frame_writer.py @@ -0,0 +1,97 @@ +"""Shared annotated frame writer for dashboard visualization. + +Modules call write_annotated_frame() to push an OpenCV-annotated JPEG to +/tmp/go2_hackathon_frame.jpg, which the dashboard MJPEG stream reads. +""" + +from __future__ import annotations + +import json +import time +from pathlib import Path +from typing import Any + +import cv2 +import numpy as np + +FRAME_PATH = Path("/tmp/go2_hackathon_frame.jpg") +STATE_PATH = Path("/tmp/go2_hackathon_state.json") + +# Threat level colors: BGR +_COLORS = { + "aggressive": (0, 0, 220), # red + "cautious": (0, 160, 255), # orange + "curious": (0, 200, 80), # green + "tracking": (255, 200, 0), # cyan + "approach": (255, 80, 200), # purple + "unknown": (180, 180, 180), # grey +} + +_FONT = cv2.FONT_HERSHEY_SIMPLEX +_FONT_SCALE = 0.5 +_THICKNESS = 1 + + +def _threat_label(threat: float | None) -> tuple[str, tuple]: + if threat is None: + return "tracking", _COLORS["tracking"] + if threat >= 0.7: + return f"THREAT {threat:.0%}", _COLORS["aggressive"] + if threat >= 0.4: + return f"cautious {threat:.0%}", _COLORS["cautious"] + return f"curious {threat:.0%}", _COLORS["curious"] + + +def write_annotated_frame( + image_bgr: np.ndarray, + detections: list[dict], # [{"bbox":(x1,y1,x2,y2), "track_id":int, "threat":float|None, "label":str}] + overlay_text: str = "", + quality: int = 75, +) -> None: + """Draw bboxes + labels on frame and write to shared JPEG path.""" + frame = image_bgr.copy() + h, w = frame.shape[:2] + + for det in detections: + x1, y1, x2, y2 = [int(v) for v in det["bbox"]] + tid = det.get("track_id", -1) + threat = det.get("threat") + label_text, color = _threat_label(threat) + custom = det.get("label") + if custom: + label_text = custom + + cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2) + + tag = f"#{tid} {label_text}" + (tw, th), _ = cv2.getTextSize(tag, _FONT, _FONT_SCALE, _THICKNESS) + ty = max(y1 - 6, th + 4) + cv2.rectangle(frame, (x1, ty - th - 4), (x1 + tw + 4, ty + 2), color, -1) + cv2.putText(frame, tag, (x1 + 2, ty), _FONT, _FONT_SCALE, (255, 255, 255), _THICKNESS, cv2.LINE_AA) + + # Overlay text (state label) in top-left + if overlay_text: + lines = overlay_text.split("\n") + y0 = 22 + for line in lines: + (lw, lh), _ = cv2.getTextSize(line, _FONT, 0.6, 2) + cv2.rectangle(frame, (8, y0 - lh - 4), (14 + lw, y0 + 4), (30, 30, 30), -1) + cv2.putText(frame, line, (10, y0), _FONT, 0.6, (0, 255, 180), 2, cv2.LINE_AA) + y0 += lh + 10 + + ok, buf = cv2.imencode(".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, quality]) + if ok: + FRAME_PATH.write_bytes(buf.tobytes()) + + +def write_state(state: dict[str, Any]) -> None: + """Write current module state as JSON for dashboard polling.""" + state["ts"] = time.time() + STATE_PATH.write_text(json.dumps(state)) + + +def read_state() -> dict[str, Any]: + try: + return json.loads(STATE_PATH.read_text()) + except Exception: + return {} diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/perception_loop.py b/dimos/robot/unitree/go2/blueprints/hackathon/perception_loop.py new file mode 100644 index 0000000000..cb5360e91c --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/perception_loop.py @@ -0,0 +1,321 @@ +"""PerceptionLoopModule — always-on L2 shared perception layer. + +Runs YOLO at ~15fps. Maintains a rolling SceneBuffer (2s TTL) with MobileCLIP +embeddings. Exposes a process-singleton so smart_follow / find / dog_mode all +read from *one* YOLO pass instead of each running their own detector. + +Cache hierarchy role: + HDD Claude via MCP — novel situations only + SSD dimOS SQLite map — spatial memory + RAM BehaviorFSMs — smart_follow, dog_mode recovery + L2 This module — always-on YOLO+MobileCLIP, SceneBuffer <50ms + L1 VisualServoing2D — servo math <1ms + +Module-level API (used by smart_follow / find / dog_mode): + get_shared_detections() → (list[Detection], timestamp_float) + get_shared_buffer() → dict[track_id, TrackedObject] + set_active_mode(str|None) — active mode suppresses idle frame-writing +""" + +from __future__ import annotations + +import time +from dataclasses import dataclass, field +from threading import Event, RLock, Thread +from typing import Any + +import numpy as np +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.models.embedding.mobileclip import MobileCLIPModel +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.perception.detection.detectors.yolo import Yolo2DDetector +from dimos.utils.logging_config import setup_logger +from dimos.robot.unitree.go2.blueprints.hackathon.frame_writer import write_annotated_frame, write_state + +logger = setup_logger() + +_BUFFER_TTL = 2.0 # seconds before a track expires +_CLIP_INTERVAL = 5 # MobileCLIP every N frames (~3fps on CPU) +_DET_FRESHNESS = 0.12 # shared detection max age in seconds +_LOOP_HZ = 15.0 +_CLIP_THRESHOLD = 0.20 + +# ── Process-singleton shared state ─────────────────────────────────────────── +_shared_lock = RLock() +_shared_detections: list = [] +_shared_det_ts: float = 0.0 +_shared_buffer: dict = {} # track_id → TrackedObject +_active_mode: str | None = None # "follow" | "find" | "dog" | None + + +def get_shared_detections() -> tuple[list, float]: + """Return (detections, monotonic_ts) from the shared YOLO pass.""" + with _shared_lock: + return list(_shared_detections), _shared_det_ts + + +def get_shared_buffer() -> dict: + """Return a snapshot of the current SceneBuffer.""" + with _shared_lock: + return dict(_shared_buffer) + + +def set_active_mode(mode: str | None) -> None: + """Signal that a behaviour module is active (suppresses idle frame-writing).""" + global _active_mode + with _shared_lock: + _active_mode = mode + + +def shared_detections_fresh() -> bool: + """True if the shared YOLO pass ran within the freshness window.""" + with _shared_lock: + return (time.monotonic() - _shared_det_ts) <= _DET_FRESHNESS + + +# ───────────────────────────────────────────────────────────────────────────── + + +@dataclass +class TrackedObject: + track_id: int + name: str + bbox: tuple + confidence: float + last_seen: float + clip_emb: Any | None = field(default=None, repr=False) # Embedding | None + + +class Config(ModuleConfig): + camera_info: CameraInfo + + +class PerceptionLoopModule(Module): + """Always-on L2 perception: YOLO+MobileCLIP, shared SceneBuffer, query_scene skill.""" + + config: Config + color_image: In[Image] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._detector = Yolo2DDetector() + self._clip = MobileCLIPModel() + self._lock = RLock() + self._latest_image: Image | None = None + self._last_fast_write: float = 0.0 # gate for _on_image dashboard writes + self._should_stop = Event() + self._thread: Thread | None = None + + @rpc + def start(self) -> None: + super().start() + self.register_disposable(Disposable(self.color_image.subscribe(self._on_image))) + self._should_stop.clear() + self._thread = Thread(target=self._perception_loop, daemon=True, name="PerceptionLoop") + self._thread.start() + logger.info("PerceptionLoopModule started.") + + @rpc + def stop(self) -> None: + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + super().stop() + + def _on_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + # ── Shared-CLIP RPCs (so SmartFollow/Find don't each load their own) ── + + @rpc + def embed_text(self, text: str) -> list[float]: + """Embed a text query using the SHARED MobileCLIP (no duplicate model loads). + Returns the embedding as a plain list[float] (RPC-serializable).""" + emb = self._clip.embed_text(text) + return emb.vector.detach().cpu().float().tolist() + + @rpc + def match_text(self, text: str, class_filter: str | None = None) -> list[dict]: + """Score tracked objects against `text` using the shared CLIP. + Returns [{track_id, bbox, name, score}] sorted by score desc. + Uses cached track embeddings — zero extra inference per frame. + Optional class_filter narrows to a YOLO class (e.g. 'person').""" + text_emb = self._clip.embed_text(text) + now = time.monotonic() + out = [] + with _shared_lock: + for t in _shared_buffer.values(): + if now - t.last_seen > _BUFFER_TTL: + continue + if class_filter and t.name.lower() != class_filter.lower(): + continue + if t.clip_emb is None: + continue + try: + score = float(text_emb @ t.clip_emb) + except Exception: + continue + out.append({ + "track_id": int(t.track_id), + "bbox": [float(v) for v in t.bbox], + "name": t.name, + "score": score, + }) + out.sort(key=lambda m: m["score"], reverse=True) + return out + + @rpc + def current_detections(self, class_filter: str | None = None) -> list[dict]: + """Snapshot of the live YOLO detections (fresh-window only). + Returns [{track_id, bbox, name, confidence}].""" + with _shared_lock: + dets = list(_shared_detections) + ts = _shared_det_ts + if (time.monotonic() - ts) > _DET_FRESHNESS: + return [] + out = [] + for d in dets: + name = getattr(d, "name", "") + if class_filter and name.lower() != class_filter.lower(): + continue + bbox = getattr(d, "bbox", None) + if bbox is None: + continue + out.append({ + "track_id": int(getattr(d, "track_id", -1) or -1), + "bbox": [float(v) for v in bbox], + "name": name, + "confidence": float(getattr(d, "confidence", 0.0) or 0.0), + }) + return out + + # ── Skill ───────────────────────────────────────────────────────────────── + + @skill + def query_scene(self, description: str = "") -> str: + """Ask what the robot's camera currently sees — no new inference triggered. + + Reads directly from the live SceneBuffer (updated at ~15fps). Use this + before making follow/find decisions — it's cheap and instant. + + Args: + description: Optional text to filter by similarity (e.g. "person with + red shirt", "dog", "chair"). Leave empty to list everything. + """ + now = time.monotonic() + with _shared_lock: + tracks = [t for t in _shared_buffer.values() if now - t.last_seen <= _BUFFER_TTL] + + if not tracks: + return "Nothing detected in current frame." + + if description: + text_emb = self._clip.embed_text(description) + scored = [] + for t in tracks: + if t.clip_emb is not None: + score = float(text_emb @ t.clip_emb) + scored.append((score, t)) + scored.sort(key=lambda x: x[0], reverse=True) + matches = [(s, t) for s, t in scored if s >= _CLIP_THRESHOLD] + if not matches: + names = ", ".join(sorted({t.name for t in tracks})) + return f"No match for '{description}'. Visible: {names}." + lines = [f"Matches for '{description}':"] + for score, t in matches[:5]: + x1, _, x2, _ = t.bbox + cx = (x1 + x2) / 2 + side = "left" if cx < 0.4 else "right" if cx > 0.6 else "center" + lines.append(f" #{t.track_id} {t.name} ({score:.0%}) — {side} of frame") + return "\n".join(lines) + + by_class: dict[str, list] = {} + for t in tracks: + by_class.setdefault(t.name, []).append(t) + lines = [f"{len(tracks)} object(s) visible:"] + for name, objs in sorted(by_class.items()): + ids = ", ".join(f"#{o.track_id}" for o in objs) + lines.append(f" {name} × {len(objs)} [{ids}]") + return "\n".join(lines) + + # ── Main loop ───────────────────────────────────────────────────────────── + + def _perception_loop(self) -> None: + global _shared_detections, _shared_det_ts, _shared_buffer + + frame_count = 0 + period = 1.0 / _LOOP_HZ + next_t = time.monotonic() + + while not self._should_stop.is_set(): + next_t += period + frame_count += 1 + now = time.monotonic() + + with self._lock: + image = self._latest_image + + if image is None: + time.sleep(period) + continue + + # YOLO every frame + result = self._detector.process_image(image) + valid = [d for d in result.detections if d.is_valid()] + + # MobileCLIP every N frames + embs: list = [None] * len(valid) + if valid and frame_count % _CLIP_INTERVAL == 0: + try: + crops = [d.cropped_image() for d in valid] + raw = self._clip.embed(*crops) if len(crops) > 1 else [self._clip.embed(crops[0])] + embs = raw + except Exception: + pass + + # Write into shared singleton + with _shared_lock: + _shared_detections = valid + _shared_det_ts = now + for det, emb in zip(valid, embs): + tid = det.track_id + existing = _shared_buffer.get(tid) + _shared_buffer[tid] = TrackedObject( + track_id = tid, + name = getattr(det, "name", "object"), + bbox = det.bbox, + confidence = float(getattr(det, "confidence", 0.0)), + last_seen = now, + clip_emb = emb if emb is not None else (existing.clip_emb if existing else None), + ) + _shared_buffer = {k: v for k, v in _shared_buffer.items() + if now - v.last_seen <= _BUFFER_TTL} + + with _shared_lock: + mode = _active_mode + if mode is None: + try: + snap = [t for t in _shared_buffer.values() if now - t.last_seen <= _BUFFER_TTL] + det_list = [{"bbox": t.bbox, "track_id": t.track_id, "label": t.name} for t in snap] + write_annotated_frame(image.to_opencv(), det_list, overlay_text="PERCEPTION\nACTIVE") + write_state({ + "mode": "perception", + "state": "ACTIVE", + "objects": len(det_list), + "classes": list({t.name for t in snap}), + }) + except Exception: + pass + + sleep_dur = next_t - time.monotonic() + if sleep_dur > 0: + time.sleep(sleep_dur) diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/smart_follow.py b/dimos/robot/unitree/go2/blueprints/hackathon/smart_follow.py new file mode 100644 index 0000000000..1c8eeb6c8d --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/hackathon/smart_follow.py @@ -0,0 +1,428 @@ +"""SmartFollowSkillContainer — multi-target YOLO+MobileCLIP person/object follower. + +L2: reads shared YOLO detections from PerceptionLoopModule (zero duplicate inference). + Falls back to its own Yolo2DDetector if the shared pass is stale. +RAM: active_track_ids set, SpinSearch recovery, nearest-target servo. +L1: VisualServoing2D control loop. + +Design: +- YOLO servo uses shared detections (already running in PerceptionLoopModule) +- CLIP re-scoring reads embeddings from SceneBuffer — no redundant CLIP calls +- Multi-target: servo toward nearest active match (largest bbox = closest) +- Class-only fast path: if query is a YOLO class name, skip CLIP entirely +""" + +from __future__ import annotations + +import time +from threading import Event, RLock, Thread +from typing import Any + +from reactivex.disposable import Disposable + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.robot.unitree.go2.blueprints.hackathon.perception_loop import PerceptionLoopModule +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.navigation.visual_servoing.visual_servoing_2d import VisualServoing2D +from dimos.utils.logging_config import setup_logger +from dimos.robot.unitree.go2.blueprints.hackathon.frame_writer import write_annotated_frame, write_state +from dimos.robot.unitree.go2.blueprints.hackathon.perception_loop import get_shared_buffer, set_active_mode + +logger = setup_logger() + +_YOLO_CLASSES = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", + "boat", "cat", "dog", "horse", "sheep", "cow", "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", "tie", "suitcase", "sports ball", "bottle", + "cup", "chair", "couch", "potted plant", "bed", "tv", "laptop", "phone", + "keyboard", "mouse", "remote", "microwave", "oven", "sink", "refrigerator", "book", +} + +_MATCH_THRESHOLD = 0.22 +_REACQUIRE_THRESHOLD = 0.20 + + +class Config(ModuleConfig): + camera_info: CameraInfo + + +class _Det: + """Lightweight wrapper around dict-returned detections from PerceptionLoop. + Exposes the .track_id / .bbox / .name / .bbox_2d_volume() API the rest of + this module already expects, so we can swap detector source without + rewriting the follow loop.""" + __slots__ = ("track_id", "bbox", "name", "confidence") + def __init__(self, m: dict) -> None: + self.track_id = m.get("track_id", -1) + self.bbox = tuple(m["bbox"]) + self.name = m.get("name", "") + self.confidence = m.get("score", m.get("confidence", 0.0)) + def bbox_2d_volume(self) -> float: + x1, y1, x2, y2 = self.bbox + return max(0.0, x2 - x1) * max(0.0, y2 - y1) + def is_valid(self) -> bool: + return True + + +class SmartFollowSkillContainer(Module): + """Multi-target person/object follower using shared L2 perception.""" + + config: Config + color_image: In[Image] + cmd_vel: Out[Twist] + # Shared inference: ONE MobileCLIP + ONE YOLO lives in PerceptionLoop. + # We don't load our own; all detection + text matching is RPC'd. + _perception: PerceptionLoopModule + + _frequency: float = 15.0 + _clip_interval: int = 5 # re-score from SceneBuffer every N frames + _max_lost_frames: int = 20 + _spin_speed: float = 0.45 + _spin_timeout_s: float = 12.0 + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + # No more own MobileCLIP / Yolo2DDetector — both come from PerceptionLoop + # via RPC. Saves ~1.8 GB of duplicated model weights per skill module. + self._visual_servo = VisualServoing2D(self.config.camera_info) + + self._latest_image: Image | None = None + self._active_ids: set[int] = set() + self._query: str = "" + self._text_emb = None + self._class_mode: bool = False + self._class_name: str = "" + + self._thread: Thread | None = None + self._should_stop = Event() + self._lock = RLock() + self._tool_name: str = "smart_follow_person" + + @rpc + def start(self) -> None: + super().start() + # CLIP loads lazily on first embed (see _clip cached model) — eager + # start() here caused concurrent multi-process model loads to stall. + self.register_disposable(Disposable(self.color_image.subscribe(self._on_image))) + + @rpc + def stop(self) -> None: + self._should_stop.set() + set_active_mode(None) + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + super().stop() + + def _on_image(self, image: Image) -> None: + with self._lock: + self._latest_image = image + + def _get_detections(self, image: Image, class_filter: str | None = None) -> list: + """Pull detections from the shared PerceptionLoop (RPC). image param + retained for signature compatibility; not used now (PerceptionLoop has + its own subscription to the same color_image stream).""" + try: + raw = self._perception.current_detections(class_filter=class_filter) + except Exception: + return [] + return [_Det(m) for m in raw] + + # ── Skills ──────────────────────────────────────────────────────────────── + + @skill + def smart_follow_person(self, query: str) -> str: + """Follow one or more people matching a description. + + YOLO+MobileCLIP instead of QwenVL — faster lock-on, auto SpinSearch recovery. + If multiple people match, follows the nearest. Switches target automatically. + + Args: + query: e.g. "gray pants", "black shirt", "woman with backpack" + """ + return self._start_follow(query, mode="person") + + @skill + def smart_follow_object(self, class_name: str) -> str: + """Follow any YOLO-detectable object by class name (no CLIP — pure fast path). + + Args: + class_name: YOLO class: "dog", "cat", "chair", "bottle", "person", etc. + """ + return self._start_follow(class_name, mode="object") + + @skill + def smart_approach(self, query: str, stop_distance_m: float = 0.8) -> str: + """Move toward a person or object and stop when close enough. + + Args: + query: Description or YOLO class name. + stop_distance_m: Stop distance in meters (default 0.8m). + """ + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._should_stop.clear() + + with self._lock: + image = self._latest_image + if image is None: + return "No camera image yet." + + if query.lower() in _YOLO_CLASSES: + self._class_mode = True + self._class_name = query.lower() + self._text_emb = None + self._query = query + matches = self._get_detections(image, class_filter=self._class_name) + else: + self._class_mode = False + self._query = query + self._text_emb = None # PerceptionLoop holds the embedding + ranked = self._perception.match_text(query, class_filter="person") + matches = [_Det(m) for m in ranked if m["score"] >= _MATCH_THRESHOLD] + if not matches: + return "No persons matching that description in current scene." + + if not matches: + return f"Cannot see '{query}' — use smart_find first." + + self._active_ids = {d.track_id for d in matches} + set_active_mode("follow") + self.start_tool("smart_approach") + self._thread = Thread(target=self._approach_loop, args=(stop_distance_m,), daemon=True, name="SmartApproach") + self._thread.start() + return f"Approaching '{query}', stopping at {stop_distance_m}m." + + @skill + def stop_following(self) -> str: + """Stop following or approaching.""" + self._should_stop.set() + set_active_mode(None) + self.cmd_vel.publish(Twist.zero()) + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._thread = None + return "Stopped." + + # ── Loops ───────────────────────────────────────────────────────────────── + + def _approach_loop(self, stop_distance_m: float) -> None: + lost_count = 0 + period = 1.0 / self._frequency + next_time = time.monotonic() + + while not self._should_stop.is_set(): + next_time += period + with self._lock: + image = self._latest_image + if image is None: + time.sleep(period) + continue + + if self._class_mode: + candidates = self._get_detections(image, class_filter=self._class_name) + self._active_ids = {d.track_id for d in candidates} + else: + candidates = self._get_detections(image, class_filter="person") + + active = [d for d in candidates if d.track_id in self._active_ids] + + if active: + lost_count = 0 + target = max(active, key=lambda d: d.bbox_2d_volume()) + dist = self._visual_servo._estimate_distance(target.bbox) + if dist is not None and dist <= stop_distance_m: + self.cmd_vel.publish(Twist.zero()) + self.tool_update("smart_approach", f"Arrived. Distance: {dist:.2f}m.") + self.stop_tool("smart_approach") + set_active_mode(None) + return + self.cmd_vel.publish(self._visual_servo.compute_twist(target.bbox, image.width)) + else: + lost_count += 1 + self.cmd_vel.publish(Twist.zero()) + if lost_count > self._max_lost_frames * 2: + self.tool_update("smart_approach", "Lost target during approach.") + self.stop_tool("smart_approach") + set_active_mode(None) + return + + sleep_dur = next_time - time.monotonic() + if sleep_dur > 0: + time.sleep(sleep_dur) + + self.cmd_vel.publish(Twist.zero()) + set_active_mode(None) + self.stop_tool("smart_approach") + + def _start_follow(self, query: str, mode: str) -> str: + self._should_stop.set() + if self._thread is not None: + self._thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + self._should_stop.clear() + + with self._lock: + image = self._latest_image + if image is None: + return "No camera image yet — is DimOS running?" + + if mode == "object" or query.lower() in _YOLO_CLASSES: + self._class_mode = True + self._class_name = query.lower() + self._text_emb = None + self._query = query + # Use the SceneBuffer (2s TTL) instead of current_detections() (120ms TTL) + # so MCP round-trip + thread-join latency don't produce a false "not visible" + # result for objects that were clearly in frame when the user clicked. + _now = time.monotonic() + _buf = get_shared_buffer() + _cls = query.lower() + buf_matches = [t for t in _buf.values() + if t.name.lower() == _cls and (_now - t.last_seen) <= 2.0] + self._active_ids = {t.track_id for t in buf_matches} + self._tool_name = "smart_follow_object" + set_active_mode("follow") + self.start_tool(self._tool_name) + self._thread = Thread(target=self._follow_loop, daemon=True, name="SmartFollow") + self._thread.start() + if not buf_matches: + return f"No '{query}' visible — will search when spinning." + return f"Tracking {len(buf_matches)} '{query}' (ids={list(self._active_ids)}). Following nearest." + + # Person mode: use shared CLIP via PerceptionLoop.match_text — scores + # cached track embeddings (zero extra inference) and returns ranked + # candidates with bbox/score. Same effective semantics as the old + # SceneBuffer fast path, just across-process instead of broken local. + self._class_mode = False + self._query = query + self._text_emb = None + + ranked = self._perception.match_text(query, class_filter="person") + matches = [_Det(m) for m in ranked if m["score"] >= _MATCH_THRESHOLD] + self._active_ids = {d.track_id for d in matches} + + self._tool_name = "smart_follow_person" + set_active_mode("follow") + self.start_tool(self._tool_name) + self._thread = Thread(target=self._follow_loop, daemon=True, name="SmartFollow") + self._thread.start() + + if not matches: + if ranked: + return (f"No match for '{query}' (best={ranked[0]['score']:.2f}, " + f"need {_MATCH_THRESHOLD}). Spinning to search.") + return "No persons in frame — spinning to search." + return f"Locked {len(matches)} match(es) for '{query}' (ids={list(self._active_ids)})." + + def _follow_loop(self) -> None: + lost_count = 0 + spin_start: float | None = None + frame_count = 0 + period = 1.0 / self._frequency + next_time = time.monotonic() + + while not self._should_stop.is_set(): + next_time += period + frame_count += 1 + + with self._lock: + image = self._latest_image + if image is None: + time.sleep(period) + continue + + # Get detections — shared YOLO pass when fresh + if self._class_mode: + candidates = self._get_detections(image, class_filter=self._class_name) + self._active_ids = {d.track_id for d in candidates} + else: + candidates = self._get_detections(image, class_filter="person") + # Re-score conditions: + # 1) IMMEDIATE if we have candidates but none of them match our + # current active_ids. This is what stops the 360 spin the + # moment the target comes into view (was: only every 5 + # frames, so the bot would happily rotate past the target). + # 2) Periodic during steady tracking to keep active_ids + # refreshed against shape/lighting changes. + _none_active_visible = candidates and not any( + d.track_id in self._active_ids for d in candidates + ) + _should_rescore = candidates and self._query and ( + _none_active_visible or frame_count % self._clip_interval == 0 + ) + if _should_rescore: + try: + ranked = self._perception.match_text(self._query, class_filter="person") + except Exception: + ranked = [] + if ranked: + new_active = {m["track_id"] for m in ranked if m["score"] >= _MATCH_THRESHOLD} + ranked_ids = {m["track_id"] for m in ranked} + # Only swap if PerceptionLoop has scored any of our candidates. + if any(d.track_id in ranked_ids for d in candidates): + self._active_ids = new_active + + active = [d for d in candidates if d.track_id in self._active_ids] + # If we just (re)acquired the target while spinning, kill the spin + # immediately — don't finish the rotation before reacting. + if active and spin_start is not None: + spin_start = None + lost_count = 0 + self.cmd_vel.publish(Twist.zero()) + + if active: + target = max(active, key=lambda d: d.bbox_2d_volume()) + self.cmd_vel.publish(self._visual_servo.compute_twist(target.bbox, image.width)) + lost_count = 0 + spin_start = None + else: + lost_count += 1 + self.cmd_vel.publish(Twist.zero()) + + if lost_count > self._max_lost_frames: + if spin_start is None: + spin_start = time.monotonic() + self.tool_update(self._tool_name, f"Lost '{self._query}' — spinning to search.") + + if time.monotonic() - spin_start > self._spin_timeout_s: + self.tool_update(self._tool_name, + f"Full 360° search failed. Could not re-acquire '{self._query}'.") + self.stop_tool(self._tool_name) + set_active_mode(None) + self.cmd_vel.publish(Twist.zero()) + return + + self.cmd_vel.publish( + Twist(linear=Vector3(0.0, 0.0, 0.0), angular=Vector3(0.0, 0.0, self._spin_speed)) + ) + + # Dashboard frame every 3 ticks + if frame_count % 3 == 0: + try: + state_label = "LOST" if lost_count > self._max_lost_frames else "FOLLOWING" + det_list = [{"bbox": d.bbox, "track_id": d.track_id, + "label": "TARGET" if d.track_id in self._active_ids else None} + for d in candidates] + write_annotated_frame(image.to_opencv(), det_list, + overlay_text=f"FOLLOW\n{state_label}\n{self._query}") + write_state({"mode": "follow", "state": state_label, + "query": self._query, "targets": len(active)}) + except Exception: + pass + + sleep_dur = next_time - time.monotonic() + if sleep_dur > 0: + time.sleep(sleep_dur) + + self.cmd_vel.publish(Twist.zero()) + set_active_mode(None) + self.tool_update(self._tool_name, "Following stopped.") + self.stop_tool(self._tool_name) diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/sounds/deepbark.mp3 b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/deepbark.mp3 new file mode 100644 index 0000000000..bc57834328 Binary files /dev/null and b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/deepbark.mp3 differ diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/sounds/dog-growling-and-barking.mp3 b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/dog-growling-and-barking.mp3 new file mode 100644 index 0000000000..e937653895 Binary files /dev/null and b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/dog-growling-and-barking.mp3 differ diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/sounds/minecraft-dog-bark.mp3 b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/minecraft-dog-bark.mp3 new file mode 100644 index 0000000000..6bcb504bdc Binary files /dev/null and b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/minecraft-dog-bark.mp3 differ diff --git a/dimos/robot/unitree/go2/blueprints/hackathon/sounds/perry-the-platypuss-growl.mp3 b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/perry-the-platypuss-growl.mp3 new file mode 100644 index 0000000000..efa6f0fbe5 Binary files /dev/null and b/dimos/robot/unitree/go2/blueprints/hackathon/sounds/perry-the-platypuss-growl.mp3 differ