From b80e3c1710791c7b1eb3cbebf49b8a42b5264caa Mon Sep 17 00:00:00 2001 From: bogwi Date: Sun, 24 May 2026 16:35:46 +0900 Subject: [PATCH 01/29] Forward SHM transports to Rerun and unify Go2 replay IPC --- dimos/constants.py | 5 ++ dimos/core/coordination/module_coordinator.py | 37 +++++++++++- dimos/core/transport.py | 36 +++++++++++- .../go2/blueprints/basic/unitree_go2_basic.py | 38 ++++++++---- dimos/visualization/rerun/bridge.py | 58 ++++++++++++++++++- 5 files changed, 157 insertions(+), 17 deletions(-) diff --git a/dimos/constants.py b/dimos/constants.py index d849f4aaf3..5b83bce636 100644 --- a/dimos/constants.py +++ b/dimos/constants.py @@ -45,6 +45,11 @@ DEFAULT_CAPACITY_COLOR_IMAGE = 1920 * 1080 * 3 # Default depth image size: 1280x720 frame * 4 (float32 size) DEFAULT_CAPACITY_DEPTH_IMAGE = 1280 * 720 * 4 +# Fixed-capacity SHM channels must be sized before the first message arrives. +# These defaults cover current Go2 replay and navigation payloads while keeping +# large local streams off UDP multicast. +DEFAULT_CAPACITY_POINTCLOUD = 64 * 1024 * 1024 +DEFAULT_CAPACITY_OCCUPANCY_GRID = 16 * 1024 * 1024 # From https://github.com/lcm-proj/lcm.git LCM_MAX_CHANNEL_NAME_LENGTH = 63 diff --git a/dimos/core/coordination/module_coordinator.py b/dimos/core/coordination/module_coordinator.py index 60c41a6ba1..d0593b8e02 100644 --- a/dimos/core/coordination/module_coordinator.py +++ b/dimos/core/coordination/module_coordinator.py @@ -30,7 +30,14 @@ from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.resource import Resource -from dimos.core.transport import LCMTransport, PubSubTransport, pLCMTransport +from dimos.core.transport import ( + JpegShmTransport, + LCMTransport, + PubSubTransport, + SHMTransport, + pLCMTransport, + pSHMTransport, +) from dimos.spec.utils import is_spec, spec_annotation_compliance, spec_structural_compliance from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -279,6 +286,9 @@ def _connect_streams(self, blueprint: Blueprint) -> None: module=module.__name__, transport=transport.__class__.__name__, ) + # SHM streams are concrete transport objects, not LCM topics. Forward + # them to Rerun after stream wiring has resolved the transport registry. + _configure_rerun_bridge_visual_transports(self) @classmethod def build( @@ -584,6 +594,31 @@ def _get_transport_for(blueprint: Blueprint, name: str, stream_type: type) -> Pu return transport +def _configure_rerun_bridge_visual_transports(coordinator: ModuleCoordinator) -> None: + """Send resolved SHM transports to an active Rerun bridge. + + RerunBridgeModule subscribes to configured pubsubs directly. For SHM + streams, the coordinator forwards the concrete transport objects after + stream wiring has selected them. + """ + from dimos.visualization.rerun.bridge import RerunBridgeModule + + if RerunBridgeModule not in coordinator._deployed_modules: + return + + # LCM transports are already visible through RerunBridgeModule.config.pubsubs. + transports = [ + transport + for transport in coordinator._transport_registry.values() + if isinstance(transport, SHMTransport | pSHMTransport | JpegShmTransport) + ] + if not transports: + return + + bridge = coordinator.get_instance(RerunBridgeModule) + bridge.set_visual_transports(transports) + + def _verify_no_name_conflicts(blueprint: Blueprint) -> None: name_to_types: dict[Any, set[type]] = defaultdict(set) name_to_modules: dict[Any, list[tuple[type, type]]] = defaultdict(list) diff --git a/dimos/core/transport.py b/dimos/core/transport.py index 6435003758..de2ced9cbb 100644 --- a/dimos/core/transport.py +++ b/dimos/core/transport.py @@ -163,14 +163,23 @@ def stop(self) -> None: class pSHMTransport(PubSubTransport[T]): + """Pickled shared-memory transport for local Python object streams.""" + _started: bool = False def __init__(self, topic: str, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(topic) + self._kwargs = kwargs self.shm = PickleSharedMemory(**kwargs) def __reduce__(self): # type: ignore[no-untyped-def] - return (pSHMTransport, (self.topic,)) + # Preserve sizing options such as default_capacity when the coordinator + # sends this transport to workers or to Rerun. + return (pSHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = PickleSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -193,14 +202,23 @@ def stop(self) -> None: class SHMTransport(PubSubTransport[T]): + """Raw bytes shared-memory transport for local fixed-size payloads.""" + _started: bool = False def __init__(self, topic: str, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(topic) + self._kwargs = kwargs self.shm = BytesSharedMemory(**kwargs) def __reduce__(self): # type: ignore[no-untyped-def] - return (SHMTransport, (self.topic,)) + # Preserve sizing options such as default_capacity when the coordinator + # sends this transport to workers or to Rerun. + return (SHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = BytesSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -223,6 +241,8 @@ def stop(self) -> None: class JpegShmTransport(PubSubTransport[T]): + """JPEG-compressed shared-memory transport for local image streams.""" + _started: bool = False def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ignore[no-untyped-def] @@ -233,9 +253,19 @@ def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ig self.shm = JpegSharedMemory(quality=quality, **kwargs) self.quality = quality + self._kwargs = kwargs def __reduce__(self): # type: ignore[no-untyped-def] - return (JpegShmTransport, (self.topic, self.quality)) + # Preserve quality and sizing options when crossing worker boundaries. + return (JpegShmTransport, (self.topic, self.quality), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + from dimos.protocol.pubsub.impl.jpeg_shm import ( + JpegSharedMemory, + ) # deferred to avoid pulling in Image/cv2/rerun + + self._kwargs = state + self.shm = JpegSharedMemory(quality=self.quality, **state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: diff --git a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py index 96a291163d..ea315119c5 100644 --- a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py +++ b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py @@ -14,29 +14,47 @@ # See the License for the specific language governing permissions and # limitations under the License. -import platform from typing import Any -from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE +from dimos.constants import ( + DEFAULT_CAPACITY_COLOR_IMAGE, + DEFAULT_CAPACITY_OCCUPANCY_GRID, + DEFAULT_CAPACITY_POINTCLOUD, +) from dimos.core.coordination.blueprints import autoconnect from dimos.core.global_config import global_config from dimos.core.transport import pSHMTransport +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 from dimos.robot.unitree.go2.connection import GO2Connection from dimos.visualization.vis_module import vis_module -# Mac has some issue with high bandwidth UDP, so we use pSHMTransport for color_image -# actually we can use pSHMTransport for all platforms, and for all streams -# TODO need a global transport toggle on blueprints/global config -_mac_transports: dict[tuple[str, type], pSHMTransport[Image]] = { +# Route large local replay and mapping streams through SHM on every platform. +# Small control/status streams continue to use the default LCM transport. +_local_high_bandwidth_transports: dict[tuple[str, type], pSHMTransport[Any]] = { ("color_image", Image): pSHMTransport( - "color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + "/color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + ), + ("lidar", PointCloud2): pSHMTransport("/lidar", default_capacity=DEFAULT_CAPACITY_POINTCLOUD), + ("pointcloud", PointCloud2): pSHMTransport( + "/pointcloud", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_map", PointCloud2): pSHMTransport( + "/global_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("merged_map", PointCloud2): pSHMTransport( + "/merged_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_costmap", OccupancyGrid): pSHMTransport( + "/global_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID + ), + ("navigation_costmap", OccupancyGrid): pSHMTransport( + "/navigation_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID ), } -_transports_base = ( - autoconnect() if platform.system() == "Linux" else autoconnect().transports(_mac_transports) -) +_transports_base = autoconnect().transports(_local_high_bandwidth_transports) def _convert_camera_info(camera_info: Any) -> Any: diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 2f5fb1efa9..a60bccf8e1 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -43,6 +43,7 @@ from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig +from dimos.core.transport import PubSubTransport from dimos.protocol.pubsub.impl.lcmpubsub import LCM from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.protocol.pubsub.spec import SubscribeAllCapable @@ -164,7 +165,10 @@ def _default_blueprint() -> Blueprint: class Config(ModuleConfig): + # Pubsubs cover discoverable sources such as LCM. visual_transports is + # populated by the coordinator for concrete local streams such as SHM. pubsubs: list[SubscribeAllCapable[Any, Any]] = field(default_factory=lambda: [LCM()]) + visual_transports: list[PubSubTransport[Any]] = field(default_factory=list) visual_override: dict[Glob | str, Callable[[Any], Archetype] | None] = field( default_factory=dict @@ -186,10 +190,12 @@ class Config(ModuleConfig): class RerunBridgeModule(Module): - """Bridge that logs messages from pubsubs to Rerun. + """Bridge that logs transport messages to Rerun. - Spawns its own Rerun viewer and subscribes to all topics on each provided - pubsub. Any message that has a to_rerun() method is automatically logged. + Spawns its own Rerun viewer and subscribes to configured pubsubs and + explicit visual transports. Pubsubs cover discoverable transports such as + LCM; visual_transports covers concrete local transports such as SHM. + Any message that has a to_rerun() method is automatically logged. Example: from dimos.protocol.pubsub.impl.lcmpubsub import LCM @@ -215,6 +221,8 @@ def __init__(self, **kwargs: Any) -> None: self._last_log = {} self._override_cache: dict[str, Callable[[Any], RerunData | None]] = {} self._frame_attached: dict[str, str] = {} + self._subscribed_visual_transport_topics: set[str] = set() + self._started = False @property def host(self) -> str: @@ -265,13 +273,52 @@ def composed(msg: Any) -> RerunData | None: return composed def _get_entity_path(self, topic: Any) -> str: + """Map a transport topic to a Rerun entity path. + + LCM topics usually already include a leading slash and a type suffix. + SHM topics are plain strings. Normalize both forms so visual overrides + such as "world/color_image" match consistently. + """ if self.config.topic_to_entity: return self.config.topic_to_entity(topic) topic_str = getattr(topic, "name", None) or str(topic) topic_str = topic_str.split("#")[0] # strip LCM topic suffix + if not topic_str.startswith("/"): + topic_str = f"/{topic_str}" return f"{self.config.entity_prefix}{topic_str}" + @rpc + def set_visual_transports(self, transports: list[PubSubTransport[Any]]) -> None: + """Replace explicit visual transports and subscribe when running. + + The coordinator calls this after stream wiring and after loading + additional blueprints into an existing coordinator. + """ + self.config.visual_transports = transports + if self._started: + self._subscribe_visual_transports() + + def _subscribe_visual_transports(self) -> None: + """Attach to configured SHM streams once per topic.""" + for transport in self.config.visual_transports: + topic = str(getattr(transport, "topic", "")) + if not topic or topic in self._subscribed_visual_transport_topics: + continue + self._subscribed_visual_transport_topics.add(topic) + if hasattr(transport, "start"): + transport.start() + unsub = transport.subscribe( + # Capture the current topic so callbacks keep the correct + # entity path even as this loop advances to the next transport. + lambda msg, transport_topic=getattr(transport, "topic", topic): self._on_message( + msg, transport_topic + ) + ) + if unsub is not None: + self.register_disposable(Disposable(unsub)) + self.register_disposable(Disposable(transport.stop)) + def _on_message(self, msg: Any, topic: Any) -> None: """Handle incoming message - log to rerun.""" @@ -306,6 +353,7 @@ def _on_message(self, msg: Any, topic: Any) -> None: @rpc def start(self) -> None: super().start() + self._started = True logger.info("Rerun bridge starting") @@ -397,6 +445,8 @@ def start(self) -> None: unsub = pubsub.subscribe_all(self._on_message) self.register_disposable(Disposable(unsub)) + self._subscribe_visual_transports() + for pubsub in self.config.pubsubs: if hasattr(pubsub, "stop"): self.register_disposable(Disposable(pubsub.stop)) # type: ignore[union-attr] @@ -506,8 +556,10 @@ def log_blueprint_graph(self, dot_code: str, module_names: list[str]) -> None: @rpc def stop(self) -> None: + self._started = False self._override_cache.clear() self._frame_attached.clear() + self._subscribed_visual_transport_topics.clear() super().stop() From c48366d7954ba7d5e4f215dd51530f391ce74de5 Mon Sep 17 00:00:00 2001 From: bogwi Date: Mon, 25 May 2026 14:24:32 +0900 Subject: [PATCH 02/29] fix: mypy --- dimos/visualization/rerun/bridge.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index a60bccf8e1..5aceba6404 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -308,12 +308,15 @@ def _subscribe_visual_transports(self) -> None: self._subscribed_visual_transport_topics.add(topic) if hasattr(transport, "start"): transport.start() + transport_topic = getattr(transport, "topic", topic) + + def on_visual_message(msg: Any, transport_topic: Any = transport_topic) -> None: + self._on_message(msg, transport_topic) + unsub = transport.subscribe( # Capture the current topic so callbacks keep the correct # entity path even as this loop advances to the next transport. - lambda msg, transport_topic=getattr(transport, "topic", topic): self._on_message( - msg, transport_topic - ) + on_visual_message ) if unsub is not None: self.register_disposable(Disposable(unsub)) From 928c08f5b264ef67bb0b053f8a4a34b3f2c4c9d0 Mon Sep 17 00:00:00 2001 From: bogwi Date: Mon, 25 May 2026 14:30:22 +0900 Subject: [PATCH 03/29] fix: Greptile P1 --- dimos/visualization/rerun/bridge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 5aceba6404..eb00f23730 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -308,6 +308,8 @@ def _subscribe_visual_transports(self) -> None: self._subscribed_visual_transport_topics.add(topic) if hasattr(transport, "start"): transport.start() + # If subscribe raises, the bridge still owns cleanup for the transport it started. + self.register_disposable(Disposable(transport.stop)) transport_topic = getattr(transport, "topic", topic) def on_visual_message(msg: Any, transport_topic: Any = transport_topic) -> None: @@ -320,7 +322,6 @@ def on_visual_message(msg: Any, transport_topic: Any = transport_topic) -> None: ) if unsub is not None: self.register_disposable(Disposable(unsub)) - self.register_disposable(Disposable(transport.stop)) def _on_message(self, msg: Any, topic: Any) -> None: """Handle incoming message - log to rerun.""" From eb622d611acdff041cda77220435d747f36ec8b1 Mon Sep 17 00:00:00 2001 From: grmkris Date: Tue, 26 May 2026 23:08:23 +0800 Subject: [PATCH 04/29] feat: add gemini/local speak skills, agentic gemini Go2 blueprint, and capture-viewer tool Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/gemini_speak_skill.py | 156 ++++++++++ dimos/agents/skills/local_speak_skill.py | 122 ++++++++ .../memory/image_embedding.py | 62 +++- dimos/memory2/go2_full_recorder.py | 76 +++++ dimos/robot/all_blueprints.py | 4 + dimos/robot/test_all_blueprints_generation.py | 9 +- .../agentic/unitree_go2_agentic_gemini.py | 43 +++ dimos/stream/audio/tts/node_gemini.py | 248 ++++++++++++++++ pyproject.toml | 2 + tools/capture-viewer/README.md | 29 ++ tools/capture-viewer/server.ts | 281 ++++++++++++++++++ 11 files changed, 1030 insertions(+), 2 deletions(-) create mode 100644 dimos/agents/skills/gemini_speak_skill.py create mode 100644 dimos/agents/skills/local_speak_skill.py create mode 100644 dimos/memory2/go2_full_recorder.py create mode 100644 dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py create mode 100644 dimos/stream/audio/tts/node_gemini.py create mode 100644 tools/capture-viewer/README.md create mode 100644 tools/capture-viewer/server.ts diff --git a/dimos/agents/skills/gemini_speak_skill.py b/dimos/agents/skills/gemini_speak_skill.py new file mode 100644 index 0000000000..2d53202008 --- /dev/null +++ b/dimos/agents/skills/gemini_speak_skill.py @@ -0,0 +1,156 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A speak skill backed by Google's Gemini TTS API. + +Drop-in replacement for ``SpeakSkill`` that reuses ``GOOGLE_API_KEY`` (already +used for the LLM and embeddings) instead of an OpenAI key, and works on any +platform with an audio output device. Satisfies ``SpeakSkillSpec`` +(``speak(text, blocking=True) -> str``). +""" + +import threading +import time + +from reactivex import Subject + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.stream.audio.node_output import SounddeviceAudioOutput +from dimos.stream.audio.tts.node_gemini import GeminiTTSNode +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class GeminiSpeakSkillConfig(ModuleConfig): + # Prebuilt Gemini voice name (e.g. "Kore", "Puck", "Charon", "Aoede"). + voice: str = "Kore" + # Gemini TTS model; must be a `*-preview-tts` model to emit audio. + model: str = "gemini-2.5-flash-preview-tts" + + +class GeminiSpeakSkill(Module): + config: GeminiSpeakSkillConfig + _tts_node: GeminiTTSNode | None = None + _audio_output: SounddeviceAudioOutput | None = None + _audio_lock: threading.Lock = threading.Lock() + _bg_threads: list[threading.Thread] = [] + _bg_threads_lock: threading.Lock = threading.Lock() + + @rpc + def start(self) -> None: + super().start() + self._tts_node = GeminiTTSNode(voice=self.config.voice, model=self.config.model) + self._audio_output = SounddeviceAudioOutput(sample_rate=24000) + self._audio_output.consume_audio(self._tts_node.emit_audio()) + + @rpc + def stop(self) -> None: + with self._bg_threads_lock: + threads = list(self._bg_threads) + for t in threads: + t.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + if self._tts_node: + self._tts_node.dispose() + self._tts_node = None + if self._audio_output: + self._audio_output.stop() + self._audio_output = None + super().stop() + + @skill + def speak(self, text: str, blocking: bool = True) -> str: + """Speak text out loud through the robot's speakers. + + USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. + + Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. + + Example usage: + + speak("Hello, I am your robot assistant.") + """ + if self._tts_node is None: + return "Error: TTS not initialized" + + if not blocking: + thread = threading.Thread( + target=self._speak_bg, args=(text,), daemon=True, name="GeminiSpeakSkill-bg" + ) + with self._bg_threads_lock: + self._bg_threads.append(thread) + thread.start() + return f"Speaking (non-blocking): {text}" + + return self._speak_blocking(text) + + def _speak_bg(self, text: str) -> None: + try: + self._speak_blocking(text) + finally: + # Remove this thread from the list of background threads when done + with self._bg_threads_lock: + self._bg_threads = [ + t for t in self._bg_threads if t is not threading.current_thread() + ] + + def _speak_blocking(self, text: str) -> str: + # Use lock to prevent simultaneous speech + with self._audio_lock: + if self._tts_node is None: + return "Error: TTS not initialized" + + text_subject: Subject[str] = Subject() + audio_complete = threading.Event() + self._tts_node.consume_text(text_subject) + + def set_as_complete(_t: str) -> None: + audio_complete.set() + + def set_as_complete_e(_e: Exception) -> None: + audio_complete.set() + + subscription = self._tts_node.emit_text().subscribe( + on_next=set_as_complete, + on_error=set_as_complete_e, + ) + + text_subject.on_next(text) + text_subject.on_completed() + + # Gemini synthesis is a network round-trip; allow more headroom than + # the local-output time so first-token latency doesn't trip the wait. + timeout = max(15, len(text) * 0.1) + + if not audio_complete.wait(timeout=timeout): + logger.warning(f"TTS timeout reached for: {text}") + subscription.dispose() + return f"Warning: TTS timeout while speaking: {text}" + else: + # Small delay to ensure buffers flush + time.sleep(0.3) + + subscription.dispose() + + return f"Spoke: {text}" + + +if __name__ == "__main__": + skill_module = GeminiSpeakSkill() + skill_module.start() + print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.")) + skill_module.stop() diff --git a/dimos/agents/skills/local_speak_skill.py b/dimos/agents/skills/local_speak_skill.py new file mode 100644 index 0000000000..1af0e8666a --- /dev/null +++ b/dimos/agents/skills/local_speak_skill.py @@ -0,0 +1,122 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A speak skill backed by the macOS ``say`` command. + +Drop-in replacement for ``SpeakSkill`` that requires no OpenAI key and no +audio pipeline: ``say`` synthesizes and plays the audio itself. macOS only. +Satisfies ``SpeakSkillSpec`` (``speak(text, blocking=True) -> str``). +""" + +import shutil +import subprocess +import threading + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class LocalSpeakSkillConfig(ModuleConfig): + # macOS `say` voice name (e.g. "Daniel", "Samantha"); None = system default. + voice: str | None = None + # Speech rate in words per minute; None = `say` default (~175). + rate: int | None = None + + +class LocalSpeakSkill(Module): + """Speak text out loud through the local macOS ``say`` command.""" + + config: LocalSpeakSkillConfig + _bg_threads: list[threading.Thread] = [] + _bg_threads_lock: threading.Lock = threading.Lock() + + @rpc + def start(self) -> None: + super().start() + if shutil.which("say") is None: + logger.warning( + "LocalSpeakSkill: `say` not found on PATH; speak() will no-op. " + "This skill is macOS-only." + ) + + @rpc + def stop(self) -> None: + with self._bg_threads_lock: + threads = list(self._bg_threads) + for t in threads: + t.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + super().stop() + + def _command(self, text: str) -> list[str]: + cmd = ["say"] + if self.config.voice: + cmd += ["-v", self.config.voice] + if self.config.rate: + cmd += ["-r", str(self.config.rate)] + cmd.append(text) + return cmd + + @skill + def speak(self, text: str, blocking: bool = True) -> str: + """Speak text out loud through the robot's speakers. + + USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. + + Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. + + Example usage: + + speak("Hello, I am your robot assistant.") + """ + if shutil.which("say") is None: + return "Error: `say` command not available (LocalSpeakSkill is macOS-only)" + + if not text.strip(): + return "Error: nothing to speak" + + if not blocking: + thread = threading.Thread( + target=self._speak_blocking, args=(text,), daemon=True, name="LocalSpeakSkill-bg" + ) + with self._bg_threads_lock: + self._bg_threads.append(thread) + thread.start() + return f"Speaking (non-blocking): {text}" + + return self._speak_blocking(text) + + def _speak_blocking(self, text: str) -> str: + try: + subprocess.run(self._command(text), check=True) + except subprocess.CalledProcessError as e: + logger.error(f"`say` failed: {e}") + return f"Error: failed to speak: {text}" + finally: + with self._bg_threads_lock: + self._bg_threads = [ + t for t in self._bg_threads if t is not threading.current_thread() + ] + return f"Spoke: {text}" + + +if __name__ == "__main__": + skill_module = LocalSpeakSkill() + skill_module.start() + print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.")) + skill_module.stop() diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index 373f8c5663..a5086228bf 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++ b/dimos/agents_deprecated/memory/image_embedding.py @@ -45,12 +45,18 @@ class ImageEmbeddingProvider: that can be stored in a vector database and used for similarity search. """ + # Remote multimodal embedding model used when model_name == "gemini". + GEMINI_EMBED_MODEL = "gemini-embedding-2" + def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: """ Initialize the image embedding provider. Args: - model_name: Name of the embedding model to use ("clip", "resnet"). + model_name: Name of the embedding model to use ("clip", "resnet", + "gemini"). "gemini" uses the remote Gemini Embedding API + (multimodal: image + text in one space) and needs + GEMINI_API_KEY or GOOGLE_API_KEY. dimensions: Dimensions of the embedding vectors """ self.model_name = model_name @@ -65,6 +71,23 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: def _initialize_model(self): # type: ignore[no-untyped-def] """Initialize the specified embedding model.""" + if self.model_name == "gemini": + from google import genai + + api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") + if not api_key: + raise ValueError( + "Gemini embeddings require GEMINI_API_KEY or GOOGLE_API_KEY to be set" + ) + # Client doubles as the "model"; sentinel processor keeps the + # None-guards in get_embedding / get_text_embedding happy. + self.model = genai.Client(api_key=api_key) # type: ignore[assignment] + self.processor = "gemini" # type: ignore[assignment] + logger.info( + f"Using remote Gemini embeddings: {self.GEMINI_EMBED_MODEL} " + f"({self.dimensions}d)" + ) + return try: import onnxruntime as ort # type: ignore[import-untyped] import torch # noqa: F401 @@ -124,6 +147,18 @@ def get_embedding(self, image: np.ndarray | str | bytes) -> np.ndarray: pil_image = self._prepare_image(image) + if self.model_name == "gemini": + try: + from google.genai import types + + buf = io.BytesIO() + pil_image.convert("RGB").save(buf, format="PNG") + part = types.Part.from_bytes(data=buf.getvalue(), mime_type="image/png") + return self._gemini_embed_content([part]) + except Exception as e: + logger.error(f"Error generating Gemini image embedding: {e}") + return np.random.randn(self.dimensions).astype(np.float32) + embedding: np.ndarray try: import torch @@ -197,6 +232,13 @@ def get_text_embedding(self, text: str) -> np.ndarray: logger.error("Model not initialized. Using fallback random embedding.") return np.random.randn(self.dimensions).astype(np.float32) + if self.model_name == "gemini": + try: + return self._gemini_embed_content(text) + except Exception as e: + logger.error(f"Error generating Gemini text embedding: {e}") + return np.random.randn(self.dimensions).astype(np.float32) + if self.model_name != "clip": logger.warning( f"Text embeddings are only supported with CLIP model, not {self.model_name}. Using random embedding." @@ -251,6 +293,24 @@ def get_text_embedding(self, text: str) -> np.ndarray: logger.error(f"Error generating text embedding: {e}") return np.random.randn(self.dimensions).astype(np.float32) + def _gemini_embed_content(self, contents: object) -> np.ndarray: + """Call the Gemini multimodal embedding API and return a normalized vector. + + `contents` may be a text string or a list of `types.Part` (e.g. an image). + Image and text land in the same space, so semantic text search over + stored frames keeps working. + """ + from google.genai import types + + cfg = types.EmbedContentConfig(output_dimensionality=self.dimensions) + result = self.model.models.embed_content( # type: ignore[union-attr] + model=self.GEMINI_EMBED_MODEL, contents=contents, config=cfg + ) + vec = np.array(result.embeddings[0].values, dtype=np.float32) + # Truncated (<3072d) Gemini embeddings are not pre-normalized. + norm = np.linalg.norm(vec) + return vec / norm if norm > 0 else vec + def _prepare_image(self, image: np.ndarray | str | bytes) -> Image.Image: """ Convert the input image to PIL format required by the models. diff --git a/dimos/memory2/go2_full_recorder.py b/dimos/memory2/go2_full_recorder.py new file mode 100644 index 0000000000..d0f78e9f90 --- /dev/null +++ b/dimos/memory2/go2_full_recorder.py @@ -0,0 +1,76 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A broad Recorder that captures the full Go2 sim stream to one SQLite file. + +Opt-in per run by appending the module name, e.g.:: + + dimos --simulation mujoco run unitree-go2-agentic-gemini go2-full-recorder + +Every declared ``In`` port is subscribed and persisted with the robot pose + +timestamp; the resulting DB is replayable (``dimos --replay --replay-db ...``) +and readable via ``SqliteStore(db).stream(name).iterate()``. +""" + +from pathlib import Path + +from langchain_core.messages.base import BaseMessage + +from dimos.core.core import rpc +from dimos.core.stream import In +from dimos.memory2.module import Recorder, RecorderConfig +from dimos.msgs.geometry_msgs.PointStamped import PointStamped +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid +from dimos.msgs.nav_msgs.Path import Path as NavPath +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 +from dimos_lcm.std_msgs.Bool import Bool + + +class Go2FullRecorderConfig(RecorderConfig): + db_path: str | Path = "assets/output/captures/go2_full.db" + + +class Go2FullRecorder(Recorder): + """Records the full robot story: vision, 3D/maps, nav state, voice + agent.""" + + # Vision + color_image: In[Image] + camera_info: In[CameraInfo] + # 3D / maps + lidar: In[PointCloud2] + global_map: In[PointCloud2] + global_costmap: In[OccupancyGrid] + # State / navigation + odom: In[PoseStamped] + path: In[NavPath] + goal: In[PointStamped] + way_point: In[PointStamped] + cmd_vel: In[Twist] + goal_reached: In[Bool] + # Voice + agent + human_input: In[str] + agent: In[BaseMessage] + + config: Go2FullRecorderConfig + + @rpc + def start(self) -> None: + # super().start() opens the SQLite store at db_path, which fails if the + # parent directory doesn't exist yet — create it first. + Path(self.config.db_path).parent.mkdir(parents=True, exist_ok=True) + super().start() diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 3d101cca79..8d8298d363 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -94,6 +94,7 @@ "unitree-g1-sim": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1_sim:unitree_g1_sim", "unitree-go2": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2", "unitree-go2-agentic": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic:unitree_go2_agentic", + "unitree-go2-agentic-gemini": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_gemini:unitree_go2_agentic_gemini", "unitree-go2-agentic-huggingface": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_huggingface:unitree_go2_agentic_huggingface", "unitree-go2-agentic-ollama": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_ollama:unitree_go2_agentic_ollama", "unitree-go2-basic": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_basic:unitree_go2_basic", @@ -149,8 +150,10 @@ "g1-high-level-web-rtc": "dimos.robot.unitree.g1.effectors.high_level.webrtc.G1HighLevelWebRtc", "g1-sim-connection": "dimos.robot.unitree.g1.mujoco_sim.G1SimConnection", "g1-whole-body-connection": "dimos.robot.unitree.g1.wholebody_connection.G1WholeBodyConnection", + "gemini-speak-skill": "dimos.agents.skills.gemini_speak_skill.GeminiSpeakSkill", "go2-connection": "dimos.robot.unitree.go2.connection.GO2Connection", "go2-fleet-connection": "dimos.robot.unitree.go2.fleet_connection.Go2FleetConnection", + "go2-full-recorder": "dimos.memory2.go2_full_recorder.Go2FullRecorder", "go2-memory": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2.Go2Memory", "google-maps-skill-container": "dimos.agents.skills.google_maps_skill_container.GoogleMapsSkillContainer", "gps-nav-skill-container": "dimos.agents.skills.gps_nav_skill.GpsNavSkillContainer", @@ -162,6 +165,7 @@ "keyboard-teleop": "dimos.robot.unitree.keyboard_teleop.KeyboardTeleop", "keyboard-teleop-module": "dimos.teleop.keyboard.keyboard_teleop_module.KeyboardTeleopModule", "local-planner": "dimos.navigation.nav_stack.modules.local_planner.local_planner.LocalPlanner", + "local-speak-skill": "dimos.agents.skills.local_speak_skill.LocalSpeakSkill", "manipulation-module": "dimos.manipulation.manipulation_module.ManipulationModule", "map": "dimos.robot.unitree.type.map.Map", "marker-tf-module": "dimos.perception.fiducial.marker_tf_module.MarkerTfModule", diff --git a/dimos/robot/test_all_blueprints_generation.py b/dimos/robot/test_all_blueprints_generation.py index d8b0081d7f..d141f4e3ed 100644 --- a/dimos/robot/test_all_blueprints_generation.py +++ b/dimos/robot/test_all_blueprints_generation.py @@ -32,7 +32,14 @@ "dimos/core/blueprints.py", "dimos/core/test_blueprints.py", } -BLUEPRINT_METHODS = {"transports", "global_config", "remappings", "requirements", "configurators"} +BLUEPRINT_METHODS = { + "transports", + "global_config", + "remappings", + "requirements", + "configurators", + "disabled_modules", +} _EXCLUDED_MODULE_NAMES = {"Module", "ModuleBase", "StreamModule"} diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py new file mode 100644 index 0000000000..49f0d2bcb2 --- /dev/null +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.gemini_speak_skill import GeminiSpeakSkill +from dimos.agents.skills.speak_skill import SpeakSkill +from dimos.core.coordination.blueprints import autoconnect +from dimos.experimental.security_demo.security_module import SecurityModule +from dimos.perception.spatial_perception import SpatialMemory +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import _common_agentic +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import unitree_go2_spatial + +# Disabled for a no-OpenAI / non-CUDA setup: +# - SecurityModule needs a CUDA GPU (EdgeTAM) -> won't boot on Apple Silicon. +# - SpeakSkill (TTS) hardcodes OpenAI -> needs OPENAI_API_KEY; replaced below +# by GeminiSpeakSkill, which uses the Gemini TTS API (reuses GOOGLE_API_KEY, +# no GPU, cross-platform so it can also run on the robot). +# SpatialMemory is re-declared to use remote Gemini embeddings instead of the +# local CLIP model (which fails on Apple CoreML); the later atom overrides the +# one inside unitree_go2_spatial. +unitree_go2_agentic_gemini = autoconnect( + unitree_go2_spatial, + SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), + McpServer.blueprint(), + McpClient.blueprint(model="google_genai:gemini-2.5-flash"), + _common_agentic, + GeminiSpeakSkill.blueprint(), +).disabled_modules(SecurityModule, SpeakSkill) + +__all__ = ["unitree_go2_agentic_gemini"] diff --git a/dimos/stream/audio/tts/node_gemini.py b/dimos/stream/audio/tts/node_gemini.py new file mode 100644 index 0000000000..78d4194d08 --- /dev/null +++ b/dimos/stream/audio/tts/node_gemini.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import threading +import time + +from google import genai +from google.genai import types +import numpy as np +from reactivex import Observable, Subject + +from dimos.stream.audio.base import ( + AbstractAudioEmitter, + AudioEvent, +) +from dimos.stream.audio.text.base import AbstractTextConsumer, AbstractTextEmitter +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + +# Gemini TTS returns single-channel 16-bit PCM at 24 kHz, little-endian. +_SAMPLE_RATE = 24000 + + +class GeminiTTSNode(AbstractTextConsumer, AbstractAudioEmitter, AbstractTextEmitter): + """ + A text-to-speech node that consumes text, emits audio using Google's Gemini TTS API, and passes through text. + + This node implements AbstractTextConsumer to receive text input, AbstractAudioEmitter + to provide audio output, and AbstractTextEmitter to pass through the text being spoken, + allowing it to be inserted into a text-to-audio pipeline with text passthrough capabilities. + + Mirrors OpenAITTSNode but uses the Gemini API directly (google-genai), so it + needs no OpenAI key and reuses the GOOGLE_API_KEY already used for the LLM + and embeddings. Only the ``*-preview-tts`` models support audio output. + """ + + def __init__( + self, + api_key: str | None = None, + voice: str = "Kore", + model: str = "gemini-2.5-flash-preview-tts", + instruction: str = "Read the following text aloud verbatim", + ) -> None: + """ + Initialize GeminiTTSNode. + + Args: + api_key: Gemini API key (if None, reads GOOGLE_API_KEY / GEMINI_API_KEY) + voice: Prebuilt Gemini voice name (e.g. "Kore", "Puck", "Charon") + model: Gemini TTS model (must be a ``*-preview-tts`` model) + instruction: Style directive prepended to each utterance. Gemini + narrates only the text after it (not the directive itself); it + also stops the model from "answering" short prompts instead of + reading them. Set to "" to send the raw text unmodified. + """ + self.voice = voice + self.model = model + self.instruction = instruction + + api_key = api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError("Gemini TTS requires GEMINI_API_KEY or GOOGLE_API_KEY to be set") + self.client = genai.Client(api_key=api_key) + + # Initialize state + self.audio_subject = Subject() # type: ignore[var-annotated] + self.text_subject = Subject() # type: ignore[var-annotated] + self.subscription = None + self.processing_thread = None + self.is_running = True + self.text_queue = [] # type: ignore[var-annotated] + self.queue_lock = threading.Lock() + + def emit_audio(self) -> Observable: # type: ignore[type-arg] + """ + Returns an observable that emits audio frames. + + Returns: + Observable emitting AudioEvent objects + """ + return self.audio_subject + + def emit_text(self) -> Observable: # type: ignore[type-arg] + """ + Returns an observable that emits the text being spoken. + + Returns: + Observable emitting text strings + """ + return self.text_subject + + def consume_text(self, text_observable: Observable) -> "AbstractTextConsumer": # type: ignore[type-arg] + """ + Start consuming text from the observable source. + + Args: + text_observable: Observable source of text strings + + Returns: + Self for method chaining + """ + logger.info("Starting GeminiTTSNode") + + # Start the processing thread + self.processing_thread = threading.Thread(target=self._process_queue, daemon=True) # type: ignore[assignment] + self.processing_thread.start() # type: ignore[attr-defined] + + # Subscribe to the text observable + self.subscription = text_observable.subscribe( # type: ignore[assignment] + on_next=self._queue_text, + on_error=lambda e: logger.error(f"Error in GeminiTTSNode: {e}"), + ) + + return self + + def _queue_text(self, text: str) -> None: + """ + Add text to the processing queue. + + Args: + text: The text to synthesize + """ + if not text.strip(): + return + + with self.queue_lock: + self.text_queue.append(text) + + def _process_queue(self) -> None: + """Background thread to process the text queue.""" + while self.is_running: + # Check if there's text to process + text_to_process = None + with self.queue_lock: + if self.text_queue: + text_to_process = self.text_queue.pop(0) + + if text_to_process: + self._synthesize_speech(text_to_process) + else: + # Sleep a bit to avoid busy-waiting + time.sleep(0.1) + + def _synthesize_speech(self, text: str) -> None: + """ + Convert text to speech using the Gemini TTS API. + + Args: + text: The text to synthesize + """ + try: + contents = f"{self.instruction}: {text}" if self.instruction else text + response = self.client.models.generate_content( + model=self.model, + contents=contents, + config=types.GenerateContentConfig( + response_modalities=["AUDIO"], + speech_config=types.SpeechConfig( + voice_config=types.VoiceConfig( + prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self.voice) + ) + ), + ), + ) + self.text_subject.on_next(text) + + # Gemini returns raw 16-bit PCM bytes (24 kHz, mono, little-endian). + pcm_bytes = response.candidates[0].content.parts[0].inline_data.data + audio_array = np.frombuffer(pcm_bytes, dtype=np.int16) + + audio_event = AudioEvent( + data=audio_array, + sample_rate=_SAMPLE_RATE, + timestamp=time.time(), + channels=1, + ) + + self.audio_subject.on_next(audio_event) + + except Exception as e: + logger.error(f"Error synthesizing speech: {e}") + + def dispose(self) -> None: + """Clean up resources.""" + logger.info("Disposing GeminiTTSNode") + + self.is_running = False + + # Clear pending items so the thread doesn't start new synthesis. + with self.queue_lock: + self.text_queue.clear() + + if self.processing_thread and self.processing_thread.is_alive(): + self.processing_thread.join(timeout=2.0) + + if self.subscription: + self.subscription.dispose() + self.subscription = None + + # Complete the subjects + self.audio_subject.on_completed() + self.text_subject.on_completed() + + +if __name__ == "__main__": + from dimos.stream.audio.node_output import SounddeviceAudioOutput + from dimos.stream.audio.text.node_stdout import TextPrinterNode + from dimos.stream.audio.utils import keepalive + + # Create a simple text subject that we can push values to + text_subject = Subject() # type: ignore[var-annotated] + + tts_node = GeminiTTSNode(voice="Kore") + tts_node.consume_text(text_subject) + + # Create and connect an audio output node - match Gemini's 24 kHz output + audio_output = SounddeviceAudioOutput(sample_rate=_SAMPLE_RATE) + audio_output.consume_audio(tts_node.emit_audio()) + + stdout = TextPrinterNode(prefix="[Spoken Text] ") + stdout.consume_text(tts_node.emit_text()) + + test_messages = [ + "Hello!", + "This is a test of the Gemini text to speech system.", + ] + + print("Starting Gemini TTS test...") + print("-" * 60) + + for message in test_messages: + text_subject.on_next(message) + + keepalive() diff --git a/pyproject.toml b/pyproject.toml index 7bc41d2cd3..ef6e6126e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -201,6 +201,8 @@ agents = [ "langchain-text-splitters>=1,<2", "langchain-huggingface>=1,<2", "langchain-ollama>=1,<2", + "langchain-google-genai>=2,<3", + "google-genai>=2,<3", "ollama>=0.6.0", "anthropic>=0.19.0", diff --git a/tools/capture-viewer/README.md b/tools/capture-viewer/README.md new file mode 100644 index 0000000000..26da5badea --- /dev/null +++ b/tools/capture-viewer/README.md @@ -0,0 +1,29 @@ +# Capture Viewer (demo) + +A tiny Bun server that browses a `go2-full-recorder` SQLite capture in the +browser — reads the DB directly (no Python). Demo-grade and disposable. + +## 1. Record a capture + +```bash +dimos --simulation mujoco run unitree-go2-agentic-gemini go2-full-recorder +# drive the dog (dashboard / `dimos agent-send "..."`), then Ctrl-C +``` +→ `assets/output/captures/go2_full.db` + +## 2. View it + +```bash +bun run tools/capture-viewer/server.ts # default db path +# or: bun run tools/capture-viewer/server.ts /path/to/go2_full.db +# or: CAPTURE_DB=/path/to.db PORT=3000 bun run tools/capture-viewer/server.ts +``` +Open http://localhost:3000 — scrub the timeline to play camera frames; the map +shows the dog's trajectory (from pose columns) with a live heading marker; the +table lists every captured stream + row count. + +## Notes +- Run the viewer **after** the capture process stops (clean WAL checkpoint). +- Renders camera frames (JPEG extracted from the LCM blob) + trajectory. + `lidar`/`global_costmap` (LCM) and `agent` (pickle) show as counts only — their + blobs need Python decoders (a job for the real Next.js app later). diff --git a/tools/capture-viewer/server.ts b/tools/capture-viewer/server.ts new file mode 100644 index 0000000000..168183bbfb --- /dev/null +++ b/tools/capture-viewer/server.ts @@ -0,0 +1,281 @@ +// Capture viewer — a tiny Bun server that browses a go2-full-recorder SQLite +// capture in the browser. Reads the DB directly (no Python), serves camera +// frames as JPEG and the robot trajectory from pose columns. +// +// bun run tools/capture-viewer/server.ts [db_path] +// CAPTURE_DB=/path/to.db bun run tools/capture-viewer/server.ts +// +// Demo-grade and disposable; lidar/costmap/agent are shown as counts only +// (their blobs are LCM/pickle and need Python decoders). + +import { Database } from "bun:sqlite"; +import { existsSync } from "node:fs"; + +const DB_PATH = + process.argv[2] ?? process.env.CAPTURE_DB ?? "assets/output/captures/go2_full.db"; +const PORT = Number(process.env.PORT ?? 3000); + +// Open read-only per request: the DB may be created after the server starts, +// and a fresh handle avoids stale WAL reads. Returns null if the file is absent. +function openDb(): Database | null { + if (!existsSync(DB_PATH)) return null; + return new Database(DB_PATH, { readonly: true }); +} + +// Names of the real streams in this DB. Used both for the UI and to validate +// any identifier we interpolate into SQL (table names can't be bound). +function streamNames(db: Database): string[] { + try { + return db + .query("SELECT name FROM _streams ORDER BY name") + .all() + .map((r: any) => r.name as string); + } catch { + // Fallback for DBs without the _streams catalog: derive from "_blob". + return db + .query( + "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%\\_blob' ESCAPE '\\'", + ) + .all() + .map((r: any) => (r.name as string).replace(/_blob$/, "")); + } +} + +// Extract the JPEG payload from an LCM-wrapped Image blob by scanning for the +// JPEG SOI (FF D8) and the last EOI (FF D9). The payload is a contiguous byte +// field inside the LCM struct, so this is reliable here. +function extractJpeg(buf: Uint8Array): Uint8Array | null { + let soi = -1; + for (let i = 0; i + 1 < buf.length; i++) { + if (buf[i] === 0xff && buf[i + 1] === 0xd8) { + soi = i; + break; + } + } + if (soi < 0) return null; + let eoi = -1; + for (let i = buf.length - 2; i > soi; i--) { + if (buf[i] === 0xff && buf[i + 1] === 0xd9) { + eoi = i + 2; + break; + } + } + if (eoi < 0) return null; + return buf.subarray(soi, eoi); +} + +const json = (data: unknown, status = 200) => Response.json(data, { status }); +const noDb = () => + json({ error: `No capture DB at ${DB_PATH}. Record one first.` }, 503); + +Bun.serve({ + port: PORT, + routes: { + "/": () => new Response(HTML, { headers: { "Content-Type": "text/html" } }), + + // [{ name, count }] for every stream, plus dbPath for the UI header. + "/api/streams": () => { + const db = openDb(); + if (!db) return noDb(); + try { + const names = streamNames(db); + const out = names.map((name) => { + let count = 0; + try { + const row = db.query(`SELECT COUNT(*) AS c FROM "${name}"`).get() as any; + count = row?.c ?? 0; + } catch { + /* listed in _streams but no table yet */ + } + return { name, count }; + }); + return json({ dbPath: DB_PATH, streams: out }); + } finally { + db.close(); + } + }, + + // Camera frame index for the scrubber (ordered by time). + "/api/frames": () => { + const db = openDb(); + if (!db) return noDb(); + try { + if (!streamNames(db).includes("color_image")) return json([]); + const rows = db + .query( + `SELECT id, ts, pose_x, pose_y, pose_qz, pose_qw + FROM "color_image" ORDER BY ts ASC`, + ) + .all(); + return json(rows); + } finally { + db.close(); + } + }, + + // Dense path for the map: prefer odom, fall back to color_image poses. + "/api/trajectory": () => { + const db = openDb(); + if (!db) return noDb(); + try { + const names = streamNames(db); + const src = names.includes("odom") + ? "odom" + : names.includes("color_image") + ? "color_image" + : null; + if (!src) return json([]); + const rows = db + .query( + `SELECT ts, pose_x, pose_y FROM "${src}" + WHERE pose_x IS NOT NULL ORDER BY ts ASC`, + ) + .all(); + return json(rows); + } finally { + db.close(); + } + }, + + // A single camera frame as image/jpeg. + "/api/frame/:id": (req) => { + const db = openDb(); + if (!db) return noDb(); + try { + const row = db + .query(`SELECT data FROM "color_image_blob" WHERE id = $id`) + .get({ $id: Number(req.params.id) }) as any; + if (!row?.data) return new Response("not found", { status: 404 }); + const jpeg = extractJpeg(row.data as Uint8Array); + if (!jpeg) return new Response("no jpeg", { status: 404 }); + return new Response(jpeg, { headers: { "Content-Type": "image/jpeg" } }); + } finally { + db.close(); + } + }, + }, +}); + +console.log(`capture-viewer: http://localhost:${PORT} (db: ${DB_PATH})`); + +const HTML = /* html */ ` +Go2 Capture Viewer + + +
+

🐕 Go2 Capture Viewer

+ loading… +
+
+
+ camera frame +
+ + + 0.0s +
+
+
+ +
streamcount
+
+
+ +`; From d60d1639e724c89e234e5ced912fd1269e112af0 Mon Sep 17 00:00:00 2001 From: grmkris Date: Tue, 26 May 2026 23:27:30 +0800 Subject: [PATCH 05/29] chore: use stock go2-memory recorder, drop Go2FullRecorder Replace the custom Go2FullRecorder with the stock go2-memory recorder for the capture demo (camera + lidar + odom is enough for the frames+trajectory viewer). Point the capture-viewer at recording_go2.db and ignore recording sidecars + the MuJoCo runtime log. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 5 ++ dimos/memory2/go2_full_recorder.py | 76 ------------------------------ dimos/robot/all_blueprints.py | 1 - tools/capture-viewer/README.md | 21 +++++---- tools/capture-viewer/server.ts | 3 +- 5 files changed, 18 insertions(+), 88 deletions(-) delete mode 100644 dimos/memory2/go2_full_recorder.py diff --git a/.gitignore b/.gitignore index aedee04af7..a3c0a05a4a 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,8 @@ htmlcov/ # Memory2 autorecord recording*.db +recording*.db-wal +recording*.db-shm + +# MuJoCo runtime log +MUJOCO_LOG.TXT diff --git a/dimos/memory2/go2_full_recorder.py b/dimos/memory2/go2_full_recorder.py deleted file mode 100644 index d0f78e9f90..0000000000 --- a/dimos/memory2/go2_full_recorder.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""A broad Recorder that captures the full Go2 sim stream to one SQLite file. - -Opt-in per run by appending the module name, e.g.:: - - dimos --simulation mujoco run unitree-go2-agentic-gemini go2-full-recorder - -Every declared ``In`` port is subscribed and persisted with the robot pose + -timestamp; the resulting DB is replayable (``dimos --replay --replay-db ...``) -and readable via ``SqliteStore(db).stream(name).iterate()``. -""" - -from pathlib import Path - -from langchain_core.messages.base import BaseMessage - -from dimos.core.core import rpc -from dimos.core.stream import In -from dimos.memory2.module import Recorder, RecorderConfig -from dimos.msgs.geometry_msgs.PointStamped import PointStamped -from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped -from dimos.msgs.geometry_msgs.Twist import Twist -from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid -from dimos.msgs.nav_msgs.Path import Path as NavPath -from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo -from dimos.msgs.sensor_msgs.Image import Image -from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 -from dimos_lcm.std_msgs.Bool import Bool - - -class Go2FullRecorderConfig(RecorderConfig): - db_path: str | Path = "assets/output/captures/go2_full.db" - - -class Go2FullRecorder(Recorder): - """Records the full robot story: vision, 3D/maps, nav state, voice + agent.""" - - # Vision - color_image: In[Image] - camera_info: In[CameraInfo] - # 3D / maps - lidar: In[PointCloud2] - global_map: In[PointCloud2] - global_costmap: In[OccupancyGrid] - # State / navigation - odom: In[PoseStamped] - path: In[NavPath] - goal: In[PointStamped] - way_point: In[PointStamped] - cmd_vel: In[Twist] - goal_reached: In[Bool] - # Voice + agent - human_input: In[str] - agent: In[BaseMessage] - - config: Go2FullRecorderConfig - - @rpc - def start(self) -> None: - # super().start() opens the SQLite store at db_path, which fails if the - # parent directory doesn't exist yet — create it first. - Path(self.config.db_path).parent.mkdir(parents=True, exist_ok=True) - super().start() diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 8d8298d363..bc454a0cbe 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -153,7 +153,6 @@ "gemini-speak-skill": "dimos.agents.skills.gemini_speak_skill.GeminiSpeakSkill", "go2-connection": "dimos.robot.unitree.go2.connection.GO2Connection", "go2-fleet-connection": "dimos.robot.unitree.go2.fleet_connection.Go2FleetConnection", - "go2-full-recorder": "dimos.memory2.go2_full_recorder.Go2FullRecorder", "go2-memory": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2.Go2Memory", "google-maps-skill-container": "dimos.agents.skills.google_maps_skill_container.GoogleMapsSkillContainer", "gps-nav-skill-container": "dimos.agents.skills.gps_nav_skill.GpsNavSkillContainer", diff --git a/tools/capture-viewer/README.md b/tools/capture-viewer/README.md index 26da5badea..6ff410a11d 100644 --- a/tools/capture-viewer/README.md +++ b/tools/capture-viewer/README.md @@ -1,29 +1,32 @@ # Capture Viewer (demo) -A tiny Bun server that browses a `go2-full-recorder` SQLite capture in the -browser — reads the DB directly (no Python). Demo-grade and disposable. +A tiny Bun server that browses a DimOS recording (SQLite) in the browser — reads +the DB directly (no Python). Demo-grade and disposable. ## 1. Record a capture +Append the stock `go2-memory` recorder to the run (records `color_image`, +`lidar`, `odom`): + ```bash -dimos --simulation mujoco run unitree-go2-agentic-gemini go2-full-recorder +dimos --simulation mujoco run unitree-go2-agentic-gemini go2-memory # drive the dog (dashboard / `dimos agent-send "..."`), then Ctrl-C ``` -→ `assets/output/captures/go2_full.db` +→ `recording_go2.db` (in the directory you ran from) ## 2. View it ```bash -bun run tools/capture-viewer/server.ts # default db path -# or: bun run tools/capture-viewer/server.ts /path/to/go2_full.db +bun run tools/capture-viewer/server.ts # default: recording_go2.db +# or: bun run tools/capture-viewer/server.ts /path/to/recording.db # or: CAPTURE_DB=/path/to.db PORT=3000 bun run tools/capture-viewer/server.ts ``` Open http://localhost:3000 — scrub the timeline to play camera frames; the map shows the dog's trajectory (from pose columns) with a live heading marker; the -table lists every captured stream + row count. +table lists every recorded stream + row count. ## Notes - Run the viewer **after** the capture process stops (clean WAL checkpoint). - Renders camera frames (JPEG extracted from the LCM blob) + trajectory. - `lidar`/`global_costmap` (LCM) and `agent` (pickle) show as counts only — their - blobs need Python decoders (a job for the real Next.js app later). + `lidar` shows as a count only — its blob is LCM-encoded and needs a Python + decoder (a job for the real app later). diff --git a/tools/capture-viewer/server.ts b/tools/capture-viewer/server.ts index 168183bbfb..c6c9e680f4 100644 --- a/tools/capture-viewer/server.ts +++ b/tools/capture-viewer/server.ts @@ -11,8 +11,7 @@ import { Database } from "bun:sqlite"; import { existsSync } from "node:fs"; -const DB_PATH = - process.argv[2] ?? process.env.CAPTURE_DB ?? "assets/output/captures/go2_full.db"; +const DB_PATH = process.argv[2] ?? process.env.CAPTURE_DB ?? "recording_go2.db"; const PORT = Number(process.env.PORT ?? 3000); // Open read-only per request: the DB may be created after the server starts, From faea9bd0ed9ec87e6428f0d04701a5ac7db775ab Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 03:38:55 +0800 Subject: [PATCH 06/29] =?UTF-8?q?feat(go2):=20take=5Fpicture=20skill=20?= =?UTF-8?q?=E2=80=94=20capture=20frame,=20upload=20to=20robomoo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New TakePictureSkill subscribes to color_image, caches the latest frame, and on take_picture() JPEG-encodes it and POSTs to robomoo's /api/robot/frame with a shared bearer token (ROBOMOO_URL / ROBOT_INGEST_TOKEN from env). Wired into unitree_go2_agentic_gemini and registered as take-picture-skill so the agent can call it ("take a picture"). Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/take_picture_skill.py | 97 ++++++ dimos/robot/all_blueprints.py | 1 + .../agentic/unitree_go2_agentic_gemini.py | 11 + tools/capture-viewer/README.md | 32 -- tools/capture-viewer/server.ts | 280 ------------------ 5 files changed, 109 insertions(+), 312 deletions(-) create mode 100644 dimos/agents/skills/take_picture_skill.py delete mode 100644 tools/capture-viewer/README.md delete mode 100644 tools/capture-viewer/server.ts diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py new file mode 100644 index 0000000000..0f3d351a57 --- /dev/null +++ b/dimos/agents/skills/take_picture_skill.py @@ -0,0 +1,97 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Capture the robot's current camera frame and upload it to the robomoo app. + +Exposes a `take_picture` skill the agent calls (e.g. "take a picture"); the +latest `color_image` frame is JPEG-encoded and POSTed to robomoo's +`/api/robot/frame` (shared-secret bearer token). Configure via env: + + ROBOMOO_URL=https://gateway-...up.railway.app + ROBOT_INGEST_TOKEN= +""" + +import os + +import cv2 +import httpx + +from dimos.agents.annotation import skill +from dimos.agents.skill_result import SkillResult +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class TakePictureSkillConfig(ModuleConfig): + robomoo_url: str = os.getenv("ROBOMOO_URL", "") + ingest_token: str = os.getenv("ROBOT_INGEST_TOKEN", "") + + +class TakePictureSkill(Module): + config: TakePictureSkillConfig + color_image: In[Image] + + @rpc + def start(self) -> None: + super().start() + self._latest: Image | None = None + self.color_image.subscribe(self._on_image) + + def _on_image(self, image: Image) -> None: + self._latest = image + + @skill + def take_picture(self, note: str = "") -> SkillResult: + """Capture a photo from the robot's camera and upload it. + + Use whenever the user asks the robot to take or capture a picture or + photo of what it currently sees. `note` is an optional short caption + to tag the image with (e.g. "kitchen", "plant"). + """ + frame = getattr(self, "_latest", None) + if frame is None: + return SkillResult.fail("NO_FRAME", "No camera frame received yet") + + url = self.config.robomoo_url + token = self.config.ingest_token + if not url or not token: + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + ok, buf = cv2.imencode(".jpg", frame.data) + if not ok: + return SkillResult.fail("EXECUTION_FAILED", "JPEG encode failed") + + try: + resp = httpx.post( + f"{url.rstrip('/')}/api/robot/frame", + headers={"Authorization": f"Bearer {token}"}, + files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, + data={"note": note}, + timeout=30.0, + ) + resp.raise_for_status() + except Exception as e: # noqa: BLE001 — report any upload failure to the agent + logger.exception("take_picture upload failed") + return SkillResult.fail("EXECUTION_FAILED", f"upload failed: {e}") + + key = resp.json().get("key", "") + logger.info("take_picture uploaded frame key=%s", key) + return SkillResult.ok(f"Picture taken and uploaded ({key})", key=key) diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index bc454a0cbe..a61c0af466 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -208,6 +208,7 @@ "simple-planner": "dimos.navigation.nav_stack.modules.simple_planner.simple_planner.SimplePlanner", "spatial-memory": "dimos.perception.spatial_perception.SpatialMemory", "speak-skill": "dimos.agents.skills.speak_skill.SpeakSkill", + "take-picture-skill": "dimos.agents.skills.take_picture_skill.TakePictureSkill", "tare-planner": "dimos.navigation.nav_stack.modules.tare_planner.tare_planner.TarePlanner", "temporal-memory": "dimos.perception.experimental.temporal_memory.temporal_memory.TemporalMemory", "terrain-analysis": "dimos.navigation.nav_stack.modules.terrain_analysis.terrain_analysis.TerrainAnalysis", diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py index 49f0d2bcb2..1c4e19ff82 100644 --- a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py @@ -16,12 +16,15 @@ from dimos.agents.mcp.mcp_client import McpClient from dimos.agents.mcp.mcp_server import McpServer from dimos.agents.skills.gemini_speak_skill import GeminiSpeakSkill +from dimos.agents.skills.person_follow import PersonFollowSkillContainer from dimos.agents.skills.speak_skill import SpeakSkill +from dimos.agents.skills.take_picture_skill import TakePictureSkill from dimos.core.coordination.blueprints import autoconnect from dimos.experimental.security_demo.security_module import SecurityModule from dimos.perception.spatial_perception import SpatialMemory from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import _common_agentic from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import unitree_go2_spatial +from dimos.robot.unitree.go2.connection import GO2Connection # Disabled for a no-OpenAI / non-CUDA setup: # - SecurityModule needs a CUDA GPU (EdgeTAM) -> won't boot on Apple Silicon. @@ -31,13 +34,21 @@ # SpatialMemory is re-declared to use remote Gemini embeddings instead of the # local CLIP model (which fails on Apple CoreML); the later atom overrides the # one inside unitree_go2_spatial. +# PersonFollowSkillContainer is likewise re-declared to use Gemini for detection +# (instead of Qwen, which needs ALIBABA_API_KEY); tracking_mode="auto" then +# resolves to periodic re-detection on non-CUDA machines and EdgeTAM on GPU. unitree_go2_agentic_gemini = autoconnect( unitree_go2_spatial, SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), McpServer.blueprint(), McpClient.blueprint(model="google_genai:gemini-2.5-flash"), _common_agentic, + PersonFollowSkillContainer.blueprint( + camera_info=GO2Connection.camera_info_static, + vl_model_name="gemini", + ), GeminiSpeakSkill.blueprint(), + TakePictureSkill.blueprint(), ).disabled_modules(SecurityModule, SpeakSkill) __all__ = ["unitree_go2_agentic_gemini"] diff --git a/tools/capture-viewer/README.md b/tools/capture-viewer/README.md deleted file mode 100644 index 6ff410a11d..0000000000 --- a/tools/capture-viewer/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Capture Viewer (demo) - -A tiny Bun server that browses a DimOS recording (SQLite) in the browser — reads -the DB directly (no Python). Demo-grade and disposable. - -## 1. Record a capture - -Append the stock `go2-memory` recorder to the run (records `color_image`, -`lidar`, `odom`): - -```bash -dimos --simulation mujoco run unitree-go2-agentic-gemini go2-memory -# drive the dog (dashboard / `dimos agent-send "..."`), then Ctrl-C -``` -→ `recording_go2.db` (in the directory you ran from) - -## 2. View it - -```bash -bun run tools/capture-viewer/server.ts # default: recording_go2.db -# or: bun run tools/capture-viewer/server.ts /path/to/recording.db -# or: CAPTURE_DB=/path/to.db PORT=3000 bun run tools/capture-viewer/server.ts -``` -Open http://localhost:3000 — scrub the timeline to play camera frames; the map -shows the dog's trajectory (from pose columns) with a live heading marker; the -table lists every recorded stream + row count. - -## Notes -- Run the viewer **after** the capture process stops (clean WAL checkpoint). -- Renders camera frames (JPEG extracted from the LCM blob) + trajectory. - `lidar` shows as a count only — its blob is LCM-encoded and needs a Python - decoder (a job for the real app later). diff --git a/tools/capture-viewer/server.ts b/tools/capture-viewer/server.ts deleted file mode 100644 index c6c9e680f4..0000000000 --- a/tools/capture-viewer/server.ts +++ /dev/null @@ -1,280 +0,0 @@ -// Capture viewer — a tiny Bun server that browses a go2-full-recorder SQLite -// capture in the browser. Reads the DB directly (no Python), serves camera -// frames as JPEG and the robot trajectory from pose columns. -// -// bun run tools/capture-viewer/server.ts [db_path] -// CAPTURE_DB=/path/to.db bun run tools/capture-viewer/server.ts -// -// Demo-grade and disposable; lidar/costmap/agent are shown as counts only -// (their blobs are LCM/pickle and need Python decoders). - -import { Database } from "bun:sqlite"; -import { existsSync } from "node:fs"; - -const DB_PATH = process.argv[2] ?? process.env.CAPTURE_DB ?? "recording_go2.db"; -const PORT = Number(process.env.PORT ?? 3000); - -// Open read-only per request: the DB may be created after the server starts, -// and a fresh handle avoids stale WAL reads. Returns null if the file is absent. -function openDb(): Database | null { - if (!existsSync(DB_PATH)) return null; - return new Database(DB_PATH, { readonly: true }); -} - -// Names of the real streams in this DB. Used both for the UI and to validate -// any identifier we interpolate into SQL (table names can't be bound). -function streamNames(db: Database): string[] { - try { - return db - .query("SELECT name FROM _streams ORDER BY name") - .all() - .map((r: any) => r.name as string); - } catch { - // Fallback for DBs without the _streams catalog: derive from "_blob". - return db - .query( - "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%\\_blob' ESCAPE '\\'", - ) - .all() - .map((r: any) => (r.name as string).replace(/_blob$/, "")); - } -} - -// Extract the JPEG payload from an LCM-wrapped Image blob by scanning for the -// JPEG SOI (FF D8) and the last EOI (FF D9). The payload is a contiguous byte -// field inside the LCM struct, so this is reliable here. -function extractJpeg(buf: Uint8Array): Uint8Array | null { - let soi = -1; - for (let i = 0; i + 1 < buf.length; i++) { - if (buf[i] === 0xff && buf[i + 1] === 0xd8) { - soi = i; - break; - } - } - if (soi < 0) return null; - let eoi = -1; - for (let i = buf.length - 2; i > soi; i--) { - if (buf[i] === 0xff && buf[i + 1] === 0xd9) { - eoi = i + 2; - break; - } - } - if (eoi < 0) return null; - return buf.subarray(soi, eoi); -} - -const json = (data: unknown, status = 200) => Response.json(data, { status }); -const noDb = () => - json({ error: `No capture DB at ${DB_PATH}. Record one first.` }, 503); - -Bun.serve({ - port: PORT, - routes: { - "/": () => new Response(HTML, { headers: { "Content-Type": "text/html" } }), - - // [{ name, count }] for every stream, plus dbPath for the UI header. - "/api/streams": () => { - const db = openDb(); - if (!db) return noDb(); - try { - const names = streamNames(db); - const out = names.map((name) => { - let count = 0; - try { - const row = db.query(`SELECT COUNT(*) AS c FROM "${name}"`).get() as any; - count = row?.c ?? 0; - } catch { - /* listed in _streams but no table yet */ - } - return { name, count }; - }); - return json({ dbPath: DB_PATH, streams: out }); - } finally { - db.close(); - } - }, - - // Camera frame index for the scrubber (ordered by time). - "/api/frames": () => { - const db = openDb(); - if (!db) return noDb(); - try { - if (!streamNames(db).includes("color_image")) return json([]); - const rows = db - .query( - `SELECT id, ts, pose_x, pose_y, pose_qz, pose_qw - FROM "color_image" ORDER BY ts ASC`, - ) - .all(); - return json(rows); - } finally { - db.close(); - } - }, - - // Dense path for the map: prefer odom, fall back to color_image poses. - "/api/trajectory": () => { - const db = openDb(); - if (!db) return noDb(); - try { - const names = streamNames(db); - const src = names.includes("odom") - ? "odom" - : names.includes("color_image") - ? "color_image" - : null; - if (!src) return json([]); - const rows = db - .query( - `SELECT ts, pose_x, pose_y FROM "${src}" - WHERE pose_x IS NOT NULL ORDER BY ts ASC`, - ) - .all(); - return json(rows); - } finally { - db.close(); - } - }, - - // A single camera frame as image/jpeg. - "/api/frame/:id": (req) => { - const db = openDb(); - if (!db) return noDb(); - try { - const row = db - .query(`SELECT data FROM "color_image_blob" WHERE id = $id`) - .get({ $id: Number(req.params.id) }) as any; - if (!row?.data) return new Response("not found", { status: 404 }); - const jpeg = extractJpeg(row.data as Uint8Array); - if (!jpeg) return new Response("no jpeg", { status: 404 }); - return new Response(jpeg, { headers: { "Content-Type": "image/jpeg" } }); - } finally { - db.close(); - } - }, - }, -}); - -console.log(`capture-viewer: http://localhost:${PORT} (db: ${DB_PATH})`); - -const HTML = /* html */ ` -Go2 Capture Viewer - - -
-

🐕 Go2 Capture Viewer

- loading… -
-
-
- camera frame -
- - - 0.0s -
-
-
- -
streamcount
-
-
- -`; From 05d2175da79595e5aacb3fa914e0f151e9909bfe Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 03:48:03 +0800 Subject: [PATCH 07/29] =?UTF-8?q?feat(go2):=20pose=20on=20captures=20+=20g?= =?UTF-8?q?lobal=5Fcostmap=20=E2=86=92=20robomoo=20map=20uploader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit take_picture now attaches the robot's odom pose (poseX/poseY) + label so the web can pin captures on the map. New MapUploader subscribes global_costmap, renders it with turbo_image, and POSTs the PNG + grid metadata to robomoo /api/robot/map (throttled). Both wired into unitree_go2_agentic_gemini. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/map_uploader.py | 87 +++++++++++++++++++ dimos/agents/skills/take_picture_skill.py | 18 +++- dimos/robot/all_blueprints.py | 1 + .../agentic/unitree_go2_agentic_gemini.py | 2 + 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 dimos/agents/skills/map_uploader.py diff --git a/dimos/agents/skills/map_uploader.py b/dimos/agents/skills/map_uploader.py new file mode 100644 index 0000000000..945fef928b --- /dev/null +++ b/dimos/agents/skills/map_uploader.py @@ -0,0 +1,87 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Periodically upload the 2D occupancy map (global_costmap) to the robomoo app. + +Subscribes to `global_costmap`, renders it to a PNG with the existing +`turbo_image` colormap (same look as the command-center), throttles, and POSTs +it plus grid metadata (resolution, origin, width, height) to robomoo's +`/api/robot/map`. The web app overlays capture pins on it via +`col = (x - originX) / resolution`, `row = (y - originY) / resolution`. + +Env: ROBOMOO_URL, ROBOT_INGEST_TOKEN. +""" + +import os +import time + +import cv2 +import httpx + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.mapping.occupancy.visualizations import turbo_image +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class MapUploaderConfig(ModuleConfig): + robomoo_url: str = os.getenv("ROBOMOO_URL", "") + ingest_token: str = os.getenv("ROBOT_INGEST_TOKEN", "") + min_period_s: float = 5.0 # throttle: at most one upload every N seconds + + +class MapUploader(Module): + config: MapUploaderConfig + global_costmap: In[OccupancyGrid] + + @rpc + def start(self) -> None: + super().start() + self._last = 0.0 + self.global_costmap.subscribe(self._on_costmap) + + def _on_costmap(self, grid: OccupancyGrid) -> None: + now = time.monotonic() + if now - self._last < self.config.min_period_s: + return + url = self.config.robomoo_url + token = self.config.ingest_token + if not url or not token: + return + self._last = now + + try: + bgr = turbo_image(grid.grid) # (H, W, 3) uint8 + ok, buf = cv2.imencode(".png", bgr) + if not ok: + return + httpx.post( + f"{url.rstrip('/')}/api/robot/map", + headers={"Authorization": f"Bearer {token}"}, + files={"file": ("map.png", buf.tobytes(), "image/png")}, + data={ + "resolution": str(grid.resolution), + "originX": str(grid.origin.position.x), + "originY": str(grid.origin.position.y), + "width": str(grid.width), + "height": str(grid.height), + }, + timeout=30.0, + ).raise_for_status() + except Exception as e: # noqa: BLE001 — best-effort; never break the stream + logger.warning("map upload failed: %s", e) diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py index 0f3d351a57..df201328a3 100644 --- a/dimos/agents/skills/take_picture_skill.py +++ b/dimos/agents/skills/take_picture_skill.py @@ -32,6 +32,7 @@ from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig from dimos.core.stream import In +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped from dimos.msgs.sensor_msgs.Image import Image from dimos.utils.logging_config import setup_logger @@ -46,16 +47,22 @@ class TakePictureSkillConfig(ModuleConfig): class TakePictureSkill(Module): config: TakePictureSkillConfig color_image: In[Image] + odom: In[PoseStamped] @rpc def start(self) -> None: super().start() self._latest: Image | None = None + self._pose: PoseStamped | None = None self.color_image.subscribe(self._on_image) + self.odom.subscribe(self._on_odom) def _on_image(self, image: Image) -> None: self._latest = image + def _on_odom(self, pose: PoseStamped) -> None: + self._pose = pose + @skill def take_picture(self, note: str = "") -> SkillResult: """Capture a photo from the robot's camera and upload it. @@ -79,12 +86,21 @@ def take_picture(self, note: str = "") -> SkillResult: if not ok: return SkillResult.fail("EXECUTION_FAILED", "JPEG encode failed") + # Attach the robot's world position so the web can pin it on the map. + data: dict[str, str] = {"note": note} + pose = getattr(self, "_pose", None) + if pose is not None: + data["poseX"] = str(pose.position.x) + data["poseY"] = str(pose.position.y) + if note: + data["label"] = note + try: resp = httpx.post( f"{url.rstrip('/')}/api/robot/frame", headers={"Authorization": f"Bearer {token}"}, files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, - data={"note": note}, + data=data, timeout=30.0, ) resp.raise_for_status() diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index a61c0af466..eea47ae4c4 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -167,6 +167,7 @@ "local-speak-skill": "dimos.agents.skills.local_speak_skill.LocalSpeakSkill", "manipulation-module": "dimos.manipulation.manipulation_module.ManipulationModule", "map": "dimos.robot.unitree.type.map.Map", + "map-uploader": "dimos.agents.skills.map_uploader.MapUploader", "marker-tf-module": "dimos.perception.fiducial.marker_tf_module.MarkerTfModule", "mcp-client": "dimos.agents.mcp.mcp_client.McpClient", "mcp-server": "dimos.agents.mcp.mcp_server.McpServer", diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py index 1c4e19ff82..bdb025f374 100644 --- a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py @@ -17,6 +17,7 @@ from dimos.agents.mcp.mcp_server import McpServer from dimos.agents.skills.gemini_speak_skill import GeminiSpeakSkill from dimos.agents.skills.person_follow import PersonFollowSkillContainer +from dimos.agents.skills.map_uploader import MapUploader from dimos.agents.skills.speak_skill import SpeakSkill from dimos.agents.skills.take_picture_skill import TakePictureSkill from dimos.core.coordination.blueprints import autoconnect @@ -49,6 +50,7 @@ ), GeminiSpeakSkill.blueprint(), TakePictureSkill.blueprint(), + MapUploader.blueprint(), ).disabled_modules(SecurityModule, SpeakSkill) __all__ = ["unitree_go2_agentic_gemini"] From ff023ca1a0e09b861e0b1da25ab6f4e48d7ef8b3 Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 12:02:44 +0800 Subject: [PATCH 08/29] feat(go2): all-Gemini VL + explore_and_capture + map uploader Consolidate the uncommitted Gemini layer onto the branch: - Gemini VL model (dimos/models/vl/gemini.py) wired into create()/types - --detection-model CLI knob + prefetch - Navigation/PersonFollow vl_model_name wiring - explore_and_capture skill gated on FrontierExplorerSpec - Gemini image/text embeddings raise instead of returning random vectors Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/navigation.py | 16 +- dimos/agents/skills/person_follow.py | 81 +++++++- dimos/agents/skills/take_picture_skill.py | 181 ++++++++++++++---- .../memory/image_embedding.py | 35 ++-- dimos/models/vl/create.py | 4 + dimos/models/vl/gemini.py | 87 +++++++++ dimos/models/vl/types.py | 2 +- .../frontier_explorer_spec.py | 24 +++ dimos/robot/cli/dimos.py | 69 +++++++ .../agentic/unitree_go2_agentic_gemini.py | 14 +- uv.lock | 160 ++++++++++++++++ 11 files changed, 607 insertions(+), 66 deletions(-) create mode 100644 dimos/models/vl/gemini.py create mode 100644 dimos/navigation/frontier_exploration/frontier_explorer_spec.py diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index d88bec452e..111ba27700 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -19,9 +19,11 @@ from dimos.agents.annotation import skill from dimos.core.core import rpc -from dimos.core.module import Module +from dimos.core.module import Module, ModuleConfig from dimos.core.stream import In from dimos.models.qwen.bbox import BBox +from dimos.models.vl.create import create +from dimos.models.vl.types import VlModelName from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped from dimos.msgs.geometry_msgs.Quaternion import Quaternion from dimos.msgs.geometry_msgs.Vector3 import Vector3, make_vector3 @@ -37,7 +39,14 @@ logger = setup_logger() +class Config(ModuleConfig): + # VL model used to detect objects in view for semantic navigation. + vl_model_name: VlModelName = "qwen" + + class NavigationSkillContainer(Module): + config: Config + _latest_image: Image | None = None _latest_odom: PoseStamped | None = None _skill_started: bool = False @@ -54,10 +63,7 @@ def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self._skill_started = False - # Here to prevent unwanted imports in the file. - from dimos.models.vl.qwen import QwenVlModel - - self._vl_model = QwenVlModel() + self._vl_model = create(self.config.vl_model_name) @rpc def start(self) -> None: diff --git a/dimos/agents/skills/person_follow.py b/dimos/agents/skills/person_follow.py index 4175898e45..a8ee6120f9 100644 --- a/dimos/agents/skills/person_follow.py +++ b/dimos/agents/skills/person_follow.py @@ -15,7 +15,7 @@ import base64 from threading import Event, RLock, Thread import time -from typing import Any +from typing import Any, Literal import numpy as np from reactivex.disposable import Disposable @@ -30,6 +30,7 @@ from dimos.models.segmentation.edge_tam import EdgeTAMProcessor from dimos.models.vl.base import VlModel from dimos.models.vl.create import create +from dimos.models.vl.types import VlModelName from dimos.msgs.geometry_msgs.Twist import Twist from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.Image import Image, ImageFormat @@ -46,6 +47,12 @@ class Config(ModuleConfig): camera_info: CameraInfo use_3d_navigation: bool = False + # VL model used for the initial (and, in "redetect" mode, continuous) detection. + vl_model_name: VlModelName = "qwen" + # "edgetam": GPU frame-to-frame tracking (default where CUDA exists). + # "redetect": periodic re-detection with the VL model (no GPU, runs on Mac). + # "auto": pick "edgetam" if CUDA is available, else "redetect". + tracking_mode: Literal["auto", "edgetam", "redetect"] = "auto" class PersonFollowSkillContainer(Module): @@ -66,13 +73,14 @@ class PersonFollowSkillContainer(Module): _frequency: float = 20.0 # Hz - control loop frequency _max_lost_frames: int = 15 # number of frames to wait before declaring person lost + _lost_timeout: float = 5.0 # seconds without a re-detection before declaring person lost _patrolling_module_spec: PatrollingModuleSpec def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self._latest_image: Image | None = None self._latest_pointcloud: PointCloud2 | None = None - self._vl_model: VlModel = create("qwen") + self._vl_model: VlModel = create(self.config.vl_model_name) self._tracker: EdgeTAMProcessor | None = None self._thread: Thread | None = None self._should_stop: Event = Event() @@ -206,9 +214,24 @@ def _on_pointcloud(self, pointcloud: PointCloud2) -> None: with self._lock: self._latest_pointcloud = pointcloud + def _resolve_tracking_mode(self) -> Literal["edgetam", "redetect"]: + """Resolve the effective tracking mode, auto-selecting by hardware.""" + mode = self.config.tracking_mode + if mode != "auto": + return mode + try: + import torch + + return "edgetam" if torch.cuda.is_available() else "redetect" + except Exception: + return "redetect" + def _follow_person( self, query: str, initial_bbox: BBox, detection_image: Image | None = None ) -> str: + if self._resolve_tracking_mode() == "redetect": + return self._follow_person_redetect(query) + x1, y1, x2, y2 = initial_bbox box = np.array([x1, y1, x2, y2], dtype=np.float32) @@ -310,6 +333,60 @@ def _follow_loop(self, tracker: "EdgeTAMProcessor", query: str) -> None: self._send_stop_reason(query, "it was requested to stop following") + def _follow_person_redetect(self, query: str) -> str: + """Start following using periodic VL re-detection (no GPU tracker). + + Used when no CUDA GPU is available (e.g. on a Mac). The initial detection + has already succeeded by the time this is called; the loop keeps + re-detecting the person with the VL model and servoing toward the bbox. + """ + self.start_tool("follow_person") + self._thread = Thread(target=self._follow_loop_redetect, args=(query,), daemon=True) + self._thread.start() + + message = ( + "Found the person. Starting to follow. You can stop following by calling " + "the 'stop_following' tool. You will receive streaming updates." + ) + + if self._patrolling_module_spec.is_patrolling(): + message += ( + " Note: since the robot was patrolling, this has been stopped automatically " + "(the equivalent of calling the `stop_patrol` tool call) so you don't have " + "to do it. " + ) + self._patrolling_module_spec.stop_patrol() + + return message + + def _follow_loop_redetect(self, query: str) -> None: + # Detection latency (~1-2s/call) paces the loop; we hold the last command + # between detections and only give up after _lost_timeout without a hit. + last_detection_time = time.monotonic() + + while not self._should_stop.is_set(): + with self._lock: + latest_image = self._latest_image + + if latest_image is None: + time.sleep(0.1) + continue + + bbox = get_object_bbox_from_image(self._vl_model, latest_image, query) + + if bbox is None: + if time.monotonic() - last_detection_time > self._lost_timeout: + self.cmd_vel.publish(Twist.zero()) + self._send_stop_reason(query, "lost track of the person") + return + continue + + last_detection_time = time.monotonic() + twist = self._visual_servo.compute_twist(bbox, latest_image.width) + self.cmd_vel.publish(twist) + + self._send_stop_reason(query, "it was requested to stop following") + def _stop_following(self) -> None: self._should_stop.set() diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py index df201328a3..8629b1fa87 100644 --- a/dimos/agents/skills/take_picture_skill.py +++ b/dimos/agents/skills/take_picture_skill.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Capture the robot's current camera frame and upload it to the robomoo app. +"""Capture robot camera frames and upload them to the robomoo app. -Exposes a `take_picture` skill the agent calls (e.g. "take a picture"); the -latest `color_image` frame is JPEG-encoded and POSTed to robomoo's +Skills: + - take_picture: one-shot capture of the current frame. + - explore_and_capture: start autonomous exploration and take a photo every few + seconds as the robot moves (deterministic — drives the cadence itself rather + than relying on the agent to keep calling take_picture). + +Each frame is JPEG-encoded and POSTed (with the robot's odom pose) to robomoo's `/api/robot/frame` (shared-secret bearer token). Configure via env: ROBOMOO_URL=https://gateway-...up.railway.app @@ -23,17 +28,23 @@ """ import os +import threading +import time import cv2 import httpx +from dimos_lcm.std_msgs import Bool from dimos.agents.annotation import skill from dimos.agents.skill_result import SkillResult from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig -from dimos.core.stream import In +from dimos.core.stream import In, Out from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped from dimos.msgs.sensor_msgs.Image import Image +from dimos.navigation.frontier_exploration.frontier_explorer_spec import ( + FrontierExplorerSpec, +) from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -48,66 +59,164 @@ class TakePictureSkill(Module): config: TakePictureSkillConfig color_image: In[Image] odom: In[PoseStamped] + # Drive frontier exploration (consumed by WavefrontFrontierExplorer). + explore_cmd: Out[Bool] + stop_explore_cmd: Out[Bool] + # Auto-wired (structurally) to WavefrontFrontierExplorer — lets us gate the + # capture loop on whether exploration is still running. + _explorer: FrontierExplorerSpec @rpc def start(self) -> None: super().start() self._latest: Image | None = None self._pose: PoseStamped | None = None + self._capture_thread: threading.Thread | None = None + self._capture_stop = threading.Event() self.color_image.subscribe(self._on_image) self.odom.subscribe(self._on_odom) + @rpc + def stop(self) -> None: + self._capture_stop.set() + thread = getattr(self, "_capture_thread", None) + if thread is not None and thread.is_alive(): + thread.join(timeout=5.0) + super().stop() + def _on_image(self, image: Image) -> None: self._latest = image def _on_odom(self, pose: PoseStamped) -> None: self._pose = pose - @skill - def take_picture(self, note: str = "") -> SkillResult: - """Capture a photo from the robot's camera and upload it. + def _configured(self) -> bool: + return bool(self.config.robomoo_url and self.config.ingest_token) - Use whenever the user asks the robot to take or capture a picture or - photo of what it currently sees. `note` is an optional short caption - to tag the image with (e.g. "kitchen", "plant"). - """ + # Encode the latest frame and POST it (with pose) to robomoo. Returns the + # stored key, or None if there's no frame / encode failed. Raises on HTTP error. + def _upload_current(self, note: str = "", label: str = "") -> str | None: frame = getattr(self, "_latest", None) if frame is None: - return SkillResult.fail("NO_FRAME", "No camera frame received yet") - - url = self.config.robomoo_url - token = self.config.ingest_token - if not url or not token: - return SkillResult.fail( - "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" - ) - + return None ok, buf = cv2.imencode(".jpg", frame.data) if not ok: - return SkillResult.fail("EXECUTION_FAILED", "JPEG encode failed") + return None - # Attach the robot's world position so the web can pin it on the map. - data: dict[str, str] = {"note": note} + data: dict[str, str] = {} + if note: + data["note"] = note + if label: + data["label"] = label pose = getattr(self, "_pose", None) if pose is not None: data["poseX"] = str(pose.position.x) data["poseY"] = str(pose.position.y) - if note: - data["label"] = note - try: - resp = httpx.post( - f"{url.rstrip('/')}/api/robot/frame", - headers={"Authorization": f"Bearer {token}"}, - files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, - data=data, - timeout=30.0, + resp = httpx.post( + f"{self.config.robomoo_url.rstrip('/')}/api/robot/frame", + headers={"Authorization": f"Bearer {self.config.ingest_token}"}, + files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, + data=data, + timeout=30.0, + ) + resp.raise_for_status() + return resp.json().get("key", "") + + @skill + def take_picture(self, note: str = "") -> SkillResult: + """Capture a photo from the robot's camera and upload it. + + Use whenever the user asks the robot to take or capture a single picture + or photo of what it currently sees. `note` is an optional short caption + to tag the image with (e.g. "kitchen", "plant"). + """ + if getattr(self, "_latest", None) is None: + return SkillResult.fail("NO_FRAME", "No camera frame received yet") + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" ) - resp.raise_for_status() - except Exception as e: # noqa: BLE001 — report any upload failure to the agent + try: + key = self._upload_current(note=note, label=note) + except Exception as e: # noqa: BLE001 — surface upload failure to the agent logger.exception("take_picture upload failed") return SkillResult.fail("EXECUTION_FAILED", f"upload failed: {e}") - - key = resp.json().get("key", "") + if key is None: + return SkillResult.fail("EXECUTION_FAILED", "JPEG encode failed") logger.info("take_picture uploaded frame key=%s", key) return SkillResult.ok(f"Picture taken and uploaded ({key})", key=key) + + @skill + def explore_and_capture( + self, + interval_s: float = 4.0, + max_duration_s: float = 600.0, + note: str = "exploring", + ) -> SkillResult: + """Explore the room and keep taking photos until exploration is complete. + + Use when the user asks the robot to explore / wander / look around AND + take pictures (or capture/photograph) as it goes. Starts autonomous + frontier exploration, then captures and uploads a frame every + `interval_s` seconds for as long as the robot is still exploring — it + stops on its own once the room is fully explored. `max_duration_s` is + only a safety cap. Returns immediately; capturing runs in the background. + """ + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + # Cancel any in-flight run before starting a new one. + self._capture_stop.set() + if self._capture_thread is not None and self._capture_thread.is_alive(): + self._capture_thread.join(timeout=5.0) + self._capture_stop = threading.Event() + + self.explore_cmd.publish(Bool(data=True)) + self._capture_thread = threading.Thread( + target=self._capture_loop, + args=(interval_s, max_duration_s, note), + daemon=True, + name="explore-and-capture", + ) + self._capture_thread.start() + + return SkillResult.ok( + f"Exploring and capturing a photo every {interval_s:.0f}s until the " + "room is fully explored." + ) + + def _exploring(self) -> bool: + try: + return bool(self._explorer.is_exploration_active()) + except Exception as e: # noqa: BLE001 — if the ref errors, keep capturing + logger.warning("is_exploration_active() failed: %s", e) + return True + + def _capture_loop(self, interval_s: float, max_duration_s: float, note: str) -> None: + start = time.monotonic() + grace_s = 6.0 # let exploration spin up before trusting the active flag + count = 0 + try: + while not self._capture_stop.is_set(): + elapsed = time.monotonic() - start + if elapsed > max_duration_s: + logger.info("explore_and_capture hit max_duration_s cap") + break + if elapsed > grace_s and not self._exploring(): + logger.info("exploration finished — stopping capture loop") + break + try: + if self._upload_current(note=note, label=note): + count += 1 + except Exception as e: # noqa: BLE001 — keep going on transient errors + logger.warning("explore_and_capture upload failed: %s", e) + self._capture_stop.wait(interval_s) + finally: + try: + self.stop_explore_cmd.publish(Bool(data=True)) + except Exception: # noqa: BLE001 — best effort on shutdown + pass + logger.info("explore_and_capture finished: uploaded %d photos", count) diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index a5086228bf..939159cca8 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++ b/dimos/agents_deprecated/memory/image_embedding.py @@ -142,22 +142,22 @@ def get_embedding(self, image: np.ndarray | str | bytes) -> np.ndarray: A numpy array containing the embedding vector """ if self.model is None or self.processor is None: - logger.error("Model not initialized. Using fallback random embedding.") - return np.random.randn(self.dimensions).astype(np.float32) + raise RuntimeError( + f"Image embedding model '{self.model_name}' is not initialized; refusing to " + "return a random vector that would poison the vector store." + ) pil_image = self._prepare_image(image) if self.model_name == "gemini": - try: - from google.genai import types + # No silent fallback: a failed embedding must abort loudly rather than + # writing a random vector into the spatial DB. + from google.genai import types - buf = io.BytesIO() - pil_image.convert("RGB").save(buf, format="PNG") - part = types.Part.from_bytes(data=buf.getvalue(), mime_type="image/png") - return self._gemini_embed_content([part]) - except Exception as e: - logger.error(f"Error generating Gemini image embedding: {e}") - return np.random.randn(self.dimensions).astype(np.float32) + buf = io.BytesIO() + pil_image.convert("RGB").save(buf, format="PNG") + part = types.Part.from_bytes(data=buf.getvalue(), mime_type="image/png") + return self._gemini_embed_content([part]) embedding: np.ndarray try: @@ -229,15 +229,14 @@ def get_text_embedding(self, text: str) -> np.ndarray: A numpy array containing the embedding vector """ if self.model is None or self.processor is None: - logger.error("Model not initialized. Using fallback random embedding.") - return np.random.randn(self.dimensions).astype(np.float32) + raise RuntimeError( + f"Text embedding model '{self.model_name}' is not initialized; refusing to " + "return a random vector that would poison the vector store." + ) if self.model_name == "gemini": - try: - return self._gemini_embed_content(text) - except Exception as e: - logger.error(f"Error generating Gemini text embedding: {e}") - return np.random.randn(self.dimensions).astype(np.float32) + # No silent fallback: surface the failure instead of corrupting the store. + return self._gemini_embed_content(text) if self.model_name != "clip": logger.warning( diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index 362a95b000..f57a113b02 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -29,3 +29,7 @@ def create(name: VlModelName) -> VlModel: from dimos.models.vl.moondream import MoondreamVlModel return MoondreamVlModel() + case "gemini": + from dimos.models.vl.gemini import GeminiVlModel + + return GeminiVlModel() diff --git a/dimos/models/vl/gemini.py b/dimos/models/vl/gemini.py new file mode 100644 index 0000000000..0aecf6e1c7 --- /dev/null +++ b/dimos/models/vl/gemini.py @@ -0,0 +1,87 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import cached_property +import os +from typing import TYPE_CHECKING, Any +import warnings + +import numpy as np +from PIL import Image as PILImage + +from dimos.models.vl.base import VlModel, VlModelConfig +from dimos.msgs.sensor_msgs.Image import Image + +if TYPE_CHECKING: + from google import genai + + +class GeminiVlModelConfig(VlModelConfig): + """Configuration for the Gemini VL model.""" + + model_name: str = "gemini-2.5-flash" + api_key: str | None = None + + +class GeminiVlModel(VlModel): + """VL model backed by Google's Gemini API (google-genai). + + Reuses the same ``GOOGLE_API_KEY`` / ``GEMINI_API_KEY`` auth as the Gemini + TTS and embedding components, so no extra key or dependency is required. + """ + + config: GeminiVlModelConfig + + @cached_property + def _client(self) -> "genai.Client": + from google import genai + + api_key = ( + self.config.api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY") + ) + if not api_key: + raise ValueError( + "Gemini VL model requires GOOGLE_API_KEY or GEMINI_API_KEY to be set" + ) + + return genai.Client(api_key=api_key) + + def _to_pil(self, image: Image | np.ndarray[Any, Any]) -> PILImage.Image: + """Convert dimos Image or numpy array to PIL Image, applying auto_resize.""" + if isinstance(image, np.ndarray): + warnings.warn( + "GeminiVlModel.query should receive standard dimos Image type, not a numpy array", + DeprecationWarning, + stacklevel=2, + ) + image = Image.from_numpy(image) + + image, _ = self._prepare_image(image) + rgb_image = image.to_rgb() + return PILImage.fromarray(rgb_image.data) + + def query(self, image: Image | np.ndarray, query: str) -> str: # type: ignore[override] + pil_image = self._to_pil(image) + + response = self._client.models.generate_content( + model=self.config.model_name, + contents=[pil_image, query], + ) + + return response.text or "" + + def stop(self) -> None: + """Release the Gemini client.""" + if "_client" in self.__dict__: + del self.__dict__["_client"] diff --git a/dimos/models/vl/types.py b/dimos/models/vl/types.py index d20a61ae37..7aa42762fe 100644 --- a/dimos/models/vl/types.py +++ b/dimos/models/vl/types.py @@ -14,4 +14,4 @@ from typing import Literal -VlModelName = Literal["qwen", "moondream"] +VlModelName = Literal["qwen", "moondream", "gemini"] diff --git a/dimos/navigation/frontier_exploration/frontier_explorer_spec.py b/dimos/navigation/frontier_exploration/frontier_explorer_spec.py new file mode 100644 index 0000000000..b08cfcb1cd --- /dev/null +++ b/dimos/navigation/frontier_exploration/frontier_explorer_spec.py @@ -0,0 +1,24 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Protocol + +from dimos.spec.utils import Spec + + +# Module-ref spec for querying frontier-exploration state. Resolves (structurally) +# to WavefrontFrontierExplorer, the only deployed module exposing +# is_exploration_active(). Lets a skill gate work on whether exploration is running. +class FrontierExplorerSpec(Spec, Protocol): + def is_exploration_active(self) -> bool: ... diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py index 4cbb8e1153..8a8504f95a 100644 --- a/dimos/robot/cli/dimos.py +++ b/dimos/robot/cli/dimos.py @@ -422,6 +422,75 @@ def _on_sigint(_sig: int, _frame: object) -> None: typer.echo(format_line(line, json_output=json_output)) +@main.command("prefetch-models") +def prefetch_models( + skip_lfs: bool = typer.Option(False, "--skip-lfs", help="Skip git-LFS model weight pulls"), + skip_whisper: bool = typer.Option(False, "--skip-whisper", help="Skip the Whisper STT model"), +) -> None: + """Pre-download local ML models so they don't cold-download mid-mission. + + Warms the HuggingFace cache (Moondream, CLIP), pulls git-LFS model weights + (YOLO, EdgeTAM, etc.) and the Whisper STT model. Safe to re-run — already + cached models are quick no-ops. Run this once after `uv sync` or before a + mission to avoid the 120s tool-call timeout caused by a 3.85GB Moondream + download happening lazily inside `look_out_for`. + """ + results: list[tuple[str, bool]] = [] + + def _run(label: str, fn: Any) -> None: + typer.echo(f"→ {label} ...") + try: + fn() + results.append((label, True)) + typer.echo(f" ✓ {label}") + except Exception as e: # noqa: BLE001 - report and continue per model + results.append((label, False)) + typer.echo(f" ✗ {label}: {e}", err=True) + + # HuggingFace models: instantiate + start() forces from_pretrained() to download. + def _warm_vl(name: str) -> None: + from dimos.models.vl.create import create + + create(name).start() + + def _warm_clip() -> None: + from dimos.models.embedding.clip import CLIPModel + + CLIPModel().start() + + _run("VL: moondream (vikhyatk/moondream2)", lambda: _warm_vl("moondream")) + _run("Embedding: CLIP (openai/clip-vit-base-patch32)", _warm_clip) + + # git-LFS model weights (pulled into the repo data dir on first access). + if not skip_lfs: + from dimos.utils.data import get_data + + for category in ( + "models_yolo", + "models_yoloe", + "models_edgetam", + "models_mobileclip", + "models_torchreid", + "models_clip", + ): + _run(f"LFS: {category}", lambda c=category: get_data(c)) + + # Whisper STT (matches node_whisper's preferred openai-whisper backend). + if not skip_whisper: + + def _warm_whisper() -> None: + import whisper + + whisper.load_model("base") + + _run("STT: whisper base", _warm_whisper) + + ok = sum(1 for _, success in results if success) + typer.echo(f"\nPrefetch complete: {ok}/{len(results)} succeeded.") + if ok != len(results): + raise typer.Exit(1) + + mcp_app = typer.Typer(help="Interact with the running MCP server") main.add_typer(mcp_app, name="mcp") diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py index bdb025f374..f9b5fc6b9d 100644 --- a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py @@ -16,6 +16,7 @@ from dimos.agents.mcp.mcp_client import McpClient from dimos.agents.mcp.mcp_server import McpServer from dimos.agents.skills.gemini_speak_skill import GeminiSpeakSkill +from dimos.agents.skills.navigation import NavigationSkillContainer from dimos.agents.skills.person_follow import PersonFollowSkillContainer from dimos.agents.skills.map_uploader import MapUploader from dimos.agents.skills.speak_skill import SpeakSkill @@ -35,18 +36,23 @@ # SpatialMemory is re-declared to use remote Gemini embeddings instead of the # local CLIP model (which fails on Apple CoreML); the later atom overrides the # one inside unitree_go2_spatial. -# PersonFollowSkillContainer is likewise re-declared to use Gemini for detection -# (instead of Qwen, which needs ALIBABA_API_KEY); tracking_mode="auto" then -# resolves to periodic re-detection on non-CUDA machines and EdgeTAM on GPU. +# Detection VL is kept LOCAL (Moondream) where it makes sense: +# - look_out_for uses the global detection_model (default "moondream"). +# - PersonFollowSkillContainer is re-declared with vl_model_name="moondream"; +# tracking_mode="auto" resolves to periodic re-detection on non-CUDA machines +# (no EdgeTAM/CUDA) and EdgeTAM on GPU. +# NavigationSkillContainer is re-declared to use Gemini for detection instead of +# Qwen, which would otherwise require ALIBABA_API_KEY. unitree_go2_agentic_gemini = autoconnect( unitree_go2_spatial, SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), McpServer.blueprint(), McpClient.blueprint(model="google_genai:gemini-2.5-flash"), _common_agentic, + NavigationSkillContainer.blueprint(vl_model_name="gemini"), PersonFollowSkillContainer.blueprint( camera_info=GO2Connection.camera_info_static, - vl_model_name="gemini", + vl_model_name="moondream", ), GeminiSpeakSkill.blueprint(), TakePictureSkill.blueprint(), diff --git a/uv.lock b/uv.lock index 55a2997df3..01c45ff589 100644 --- a/uv.lock +++ b/uv.lock @@ -1883,9 +1883,11 @@ dependencies = [ agents = [ { name = "anthropic" }, { name = "faster-whisper" }, + { name = "google-genai" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -1913,6 +1915,7 @@ all = [ { name = "ffmpeg-python" }, { name = "filterpy" }, { name = "gdown" }, + { name = "google-genai" }, { name = "googlemaps" }, { name = "gtsam-extended" }, { name = "hydra-core" }, @@ -1922,6 +1925,7 @@ all = [ { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -2004,11 +2008,13 @@ base = [ { name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, + { name = "google-genai" }, { name = "hydra-core" }, { name = "jinja2" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -2133,12 +2139,14 @@ unitree = [ { name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, + { name = "google-genai" }, { name = "gtsam-extended" }, { name = "hydra-core" }, { name = "jinja2" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -2166,12 +2174,14 @@ unitree-dds = [ { name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, + { name = "google-genai" }, { name = "gtsam-extended" }, { name = "hydra-core" }, { name = "jinja2" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -2384,6 +2394,7 @@ requires-dist = [ { name = "ffmpeg-python", marker = "extra == 'web'" }, { name = "filterpy", marker = "extra == 'perception'", specifier = ">=1.4.5" }, { name = "gdown", marker = "extra == 'misc'", specifier = ">=5.2.2" }, + { name = "google-genai", marker = "extra == 'agents'", specifier = ">=2,<3" }, { name = "googlemaps", marker = "extra == 'misc'", specifier = ">=4.10.0" }, { name = "gtsam-extended", marker = "extra == 'mapping'", specifier = ">=4.3a1.post1" }, { name = "hydra-core", marker = "extra == 'perception'", specifier = ">=1.3.0" }, @@ -2393,6 +2404,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = ">=1.2.3,<2" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-core", marker = "extra == 'agents'", specifier = ">=1.2.22,<2" }, + { name = "langchain-google-genai", marker = "extra == 'agents'", specifier = ">=2,<3" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -3142,6 +3154,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/10/da216e25ef2f3c9dfa75574aa27f5f4c7e5fb5540308f04e4d8c4d834ecb/filelock-3.23.0-py3-none-any.whl", hash = "sha256:4203c3f43983c7c95e4bbb68786f184f6acb7300899bf99d686bb82d526bdf62", size = 22227, upload-time = "2026-02-14T02:53:56.122Z" }, ] +[[package]] +name = "filetype" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, +] + [[package]] name = "filterpy" version = "1.4.5" @@ -3449,6 +3470,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/9b/4366ad3e1c0688146c70aa6143584d6a8d88583b9390f106250e25a3d5cd/glfw-2.10.0-py2.py3-none-win_amd64.whl", hash = "sha256:7f787ee8645781f10e8800438ce4357ab38c573ffb191aba380c1e72eba6311c", size = 559423, upload-time = "2026-03-10T17:21:34.766Z" }, ] +[[package]] +name = "google-ai-generativelanguage" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core", extra = ["grpc"] }, + { name = "google-auth" }, + { name = "grpcio" }, + { name = "proto-plus" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/36/9365ca196c4753d0d22951ee0147ecb2124382a010d5dbfb9f3ecd57f2cf/google_ai_generativelanguage-0.11.0.tar.gz", hash = "sha256:d9e24e9836e894a85b52ca03d03530988aeb492d48df71cd1573dc1c3b6d81fc", size = 1539696, upload-time = "2026-03-30T22:51:43.143Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/8c/dc1faf90d7a6ec77e09a6706abefabb41ae5b8d838e5e5b16914da26ee1e/google_ai_generativelanguage-0.11.0-py3-none-any.whl", hash = "sha256:f797f307f0969ae49622e09c1d1a23aa86c7538ffae881279506548166e91d45", size = 1432183, upload-time = "2026-03-30T22:47:56.02Z" }, +] + +[[package]] +name = "google-api-core" +version = "2.30.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" }, +] + +[package.optional-dependencies] +grpc = [ + { name = "grpcio" }, + { name = "grpcio-status" }, +] + +[[package]] +name = "google-auth" +version = "2.53.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "pyasn1-modules" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/ad/ff781329bbbdc0974a098d996e89c9e1f7024262f9e3eec442fbb9ad1ac6/google_auth-2.53.0.tar.gz", hash = "sha256:e7e6aa16f6bee7b2b264830fd04f08087a1d5a836df516251a5d15327b246c9c", size = 335844, upload-time = "2026-05-15T20:53:07.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/c9/db44165ba7c581268c6d46017ef63339110378305062830104fc7fa144cb/google_auth-2.53.0-py3-none-any.whl", hash = "sha256:6e7449917c599b35126a99ec268ec6880301f2fea41dce198fe8fd83ff642b68", size = 246071, upload-time = "2026-05-15T20:53:05.609Z" }, +] + +[package.optional-dependencies] +requests = [ + { name = "requests" }, +] + [[package]] name = "google-crc32c" version = "1.8.0" @@ -3484,6 +3561,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/97/7d75fe37a7a6ed171a2cf17117177e7aab7e6e0d115858741b41e9dd4254/google_crc32c-1.8.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c", size = 28800, upload-time = "2025-12-16T00:40:30.322Z" }, ] +[[package]] +name = "google-genai" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "google-auth", extra = ["requests"] }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "sniffio" }, + { name = "tenacity" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/91/15fff1b7c22dc0b5b44fa348f151379721353d06a3da22dbbe15bf2c4dd9/google_genai-2.4.0.tar.gz", hash = "sha256:3f8d4bd618be2801e805dc698726731a70f34b438ea25a6c92800eef5b1f513e", size = 552025, upload-time = "2026-05-18T00:25:14.556Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/c8/1b49f6bb69dc7e291ba5d14926937bec4dffcc09ee875822636bd5ef3cf4/google_genai-2.4.0-py3-none-any.whl", hash = "sha256:48df1c44190b05b834fee9cffd360af6d6fd7f68164588e7ebd670fa34f71ee1", size = 818207, upload-time = "2026-05-18T00:25:12.426Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.72.0" @@ -3566,6 +3664,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/b2/b096ccce418882fbfda4f7496f9357aaa9a5af1896a9a7f60d9f2b275a06/grpcio-1.78.0-cp314-cp314-win_amd64.whl", hash = "sha256:dce09d6116df20a96acfdbf85e4866258c3758180e8c49845d6ba8248b6d0bbb", size = 4929852, upload-time = "2026-02-06T09:56:45.885Z" }, ] +[[package]] +name = "grpcio-status" +version = "1.78.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/cd/89ce482a931b543b92cdd9b2888805518c4620e0094409acb8c81dd4610a/grpcio_status-1.78.0.tar.gz", hash = "sha256:a34cfd28101bfea84b5aa0f936b4b423019e9213882907166af6b3bddc59e189", size = 13808, upload-time = "2026-02-06T10:01:48.034Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/8a/1241ec22c41028bddd4a052ae9369267b4475265ad0ce7140974548dc3fa/grpcio_status-1.78.0-py3-none-any.whl", hash = "sha256:b492b693d4bf27b47a6c32590701724f1d3b9444b36491878fb71f6208857f34", size = 14523, upload-time = "2026-02-06T10:01:32.584Z" }, +] + [[package]] name = "gtsam-extended" version = "4.3a1.post1" @@ -4590,6 +4702,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/01/4771b7ab2af1d1aba5b710bd8f13d9225c609425214b357590a17b01be77/langchain_core-1.3.3-py3-none-any.whl", hash = "sha256:18aae8506f37da7f74398492279a7d6efcee4f8e23c4c41c7af080eeb7ef7bd1", size = 543857, upload-time = "2026-05-05T19:02:34.52Z" }, ] +[[package]] +name = "langchain-google-genai" +version = "2.1.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filetype" }, + { name = "google-ai-generativelanguage" }, + { name = "langchain-core" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/38/8b3a71c729bd03e9eb0fd8bdb19e06a074c35bc2eaa61b1b9edfa863f38d/langchain_google_genai-2.1.12.tar.gz", hash = "sha256:4a98371e545eb97fcdf483086a4aebbb8eceeb9597ca5a9c4c35e92f4fbbd271", size = 77566, upload-time = "2025-09-17T01:27:11.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/8d/9dd9653e5414e73cae3480e5947bbbbd94ba7fa824efdf46e7ff2c0faef2/langchain_google_genai-2.1.12-py3-none-any.whl", hash = "sha256:4c07630419a8fbe7a2ec512c6dea68289663bfe7d5fae0ba431d2cd59a0d0880", size = 50746, upload-time = "2025-09-17T01:27:10.653Z" }, +] + [[package]] name = "langchain-huggingface" version = "1.2.0" @@ -7637,6 +7764,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, ] +[[package]] +name = "proto-plus" +version = "1.28.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/56/e647b0c675392d2da368da7b6f158f7368b18542fd6f7d7400a2f39de000/proto_plus-1.28.0.tar.gz", hash = "sha256:38e5696342835b08fc116f30a25665b29531cda9d5d5643e9b81fc312385abd9", size = 57221, upload-time = "2026-05-07T08:04:50.811Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/20/b122d4626976acb81132036d2ad1bb35a1a8775fceb837ec30964622516a/proto_plus-1.28.0-py3-none-any.whl", hash = "sha256:a630604310899e73c59ec302e5765c058d412b2f090b9c79c8822589f14955b8", size = 50410, upload-time = "2026-05-07T08:03:31.962Z" }, +] + [[package]] name = "protobuf" version = "6.33.5" @@ -7842,6 +7981,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/9c/47693463894b610f8439b2e970b82ef81e9599c757bf2049365e40ff963c/pyarrow-23.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:427deac1f535830a744a4f04a6ac183a64fcac4341b3f618e693c41b7b98d2b0", size = 28338905, upload-time = "2026-01-18T16:19:32.93Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pyaudio" version = "0.2.14" From 0b0286db68ba71fa181081b3896d04f2b85966cb Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 12:06:42 +0800 Subject: [PATCH 09/29] feat(go2): all-Gemini VL (no local Moondream) for Mac-only agentic blueprint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dog has no onboard brain — all compute runs on the Mac. Moondream is unusable there (~6 min/inference + Metal crash, which aborted the blueprint at startup and looked like a connection failure). Route every VL path to Gemini: PersonFollow vl_model_name moondream->gemini, plus .global_config(detection_model=gemini) for look_out_for / PerceiveLoop. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agentic/unitree_go2_agentic_gemini.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py index f9b5fc6b9d..1a5e435223 100644 --- a/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py +++ b/dimos/robot/unitree/go2/blueprints/agentic/unitree_go2_agentic_gemini.py @@ -36,27 +36,30 @@ # SpatialMemory is re-declared to use remote Gemini embeddings instead of the # local CLIP model (which fails on Apple CoreML); the later atom overrides the # one inside unitree_go2_spatial. -# Detection VL is kept LOCAL (Moondream) where it makes sense: -# - look_out_for uses the global detection_model (default "moondream"). -# - PersonFollowSkillContainer is re-declared with vl_model_name="moondream"; -# tracking_mode="auto" resolves to periodic re-detection on non-CUDA machines -# (no EdgeTAM/CUDA) and EdgeTAM on GPU. -# NavigationSkillContainer is re-declared to use Gemini for detection instead of -# Qwen, which would otherwise require ALIBABA_API_KEY. -unitree_go2_agentic_gemini = autoconnect( - unitree_go2_spatial, - SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), - McpServer.blueprint(), - McpClient.blueprint(model="google_genai:gemini-2.5-flash"), - _common_agentic, - NavigationSkillContainer.blueprint(vl_model_name="gemini"), - PersonFollowSkillContainer.blueprint( - camera_info=GO2Connection.camera_info_static, - vl_model_name="moondream", - ), - GeminiSpeakSkill.blueprint(), - TakePictureSkill.blueprint(), - MapUploader.blueprint(), -).disabled_modules(SecurityModule, SpeakSkill) +# ALL compute runs on the Mac (the dog has no onboard brain), so VL detection is +# ALSO Gemini — no local Moondream (~6 min/inference + crashes Metal on Apple +# Silicon, which aborted the blueprint at startup): +# - .global_config(detection_model="gemini") drives look_out_for / PerceiveLoop. +# - NavigationSkillContainer + PersonFollowSkillContainer use vl_model_name="gemini" +# (these read the per-skill name, not the global knob). +unitree_go2_agentic_gemini = ( + autoconnect( + unitree_go2_spatial, + SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), + McpServer.blueprint(), + McpClient.blueprint(model="google_genai:gemini-2.5-flash"), + _common_agentic, + NavigationSkillContainer.blueprint(vl_model_name="gemini"), + PersonFollowSkillContainer.blueprint( + camera_info=GO2Connection.camera_info_static, + vl_model_name="gemini", + ), + GeminiSpeakSkill.blueprint(), + TakePictureSkill.blueprint(), + MapUploader.blueprint(), + ) + .global_config(detection_model="gemini") + .disabled_modules(SecurityModule, SpeakSkill) +) __all__ = ["unitree_go2_agentic_gemini"] From 0d3cc4818b77ae96f6787f23adbd7251360ef1fa Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 14:10:01 +0800 Subject: [PATCH 10/29] feat(speak): non-blocking by default + fix per-call TTS thread leak GeminiSpeakSkill called GeminiTTSNode.consume_text() on every speak(), which spawned a fresh worker thread + subscription each call (the repeated 'Starting GeminiTTSNode' log) and leaked the old ones. Wire the TTS pipeline ONCE in start() against a long-lived Subject; each speak() just pushes onto it and the node drains FIFO. Default speak() to blocking=False so the agent isn't stalled on synthesis; blocking=True still waits, matching on the emitted utterance so a concurrent non-blocking speak can't trip the wait. Make consume_text idempotent as defense-in-depth. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/gemini_speak_skill.py | 85 ++++++++++------------- dimos/stream/audio/tts/node_gemini.py | 13 ++++ 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/dimos/agents/skills/gemini_speak_skill.py b/dimos/agents/skills/gemini_speak_skill.py index 2d53202008..aecdad9cbe 100644 --- a/dimos/agents/skills/gemini_speak_skill.py +++ b/dimos/agents/skills/gemini_speak_skill.py @@ -26,7 +26,6 @@ from reactivex import Subject from dimos.agents.annotation import skill -from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig from dimos.stream.audio.node_output import SounddeviceAudioOutput @@ -48,8 +47,7 @@ class GeminiSpeakSkill(Module): _tts_node: GeminiTTSNode | None = None _audio_output: SounddeviceAudioOutput | None = None _audio_lock: threading.Lock = threading.Lock() - _bg_threads: list[threading.Thread] = [] - _bg_threads_lock: threading.Lock = threading.Lock() + _text_subject: "Subject[str] | None" = None @rpc def start(self) -> None: @@ -57,100 +55,91 @@ def start(self) -> None: self._tts_node = GeminiTTSNode(voice=self.config.voice, model=self.config.model) self._audio_output = SounddeviceAudioOutput(sample_rate=24000) self._audio_output.consume_audio(self._tts_node.emit_audio()) + # Wire the text pipeline ONCE. Each speak() just pushes onto this subject; + # the TTS node's own worker drains it FIFO. (Previously consume_text was + # called per speak(), spawning a fresh worker thread + subscription every + # call — a leak, and the source of the repeated "Starting GeminiTTSNode".) + self._text_subject = Subject() + self._tts_node.consume_text(self._text_subject) @rpc def stop(self) -> None: - with self._bg_threads_lock: - threads = list(self._bg_threads) - for t in threads: - t.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) if self._tts_node: + # dispose() clears the queue and joins the worker, so in-flight/queued + # speech is torn down here — no separate bg-thread bookkeeping needed. self._tts_node.dispose() self._tts_node = None if self._audio_output: self._audio_output.stop() self._audio_output = None + self._text_subject = None super().stop() @skill - def speak(self, text: str, blocking: bool = True) -> str: + def speak(self, text: str, blocking: bool = False) -> str: """Speak text out loud through the robot's speakers. USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. + Returns immediately by default (the audio plays in the background); pass + ``blocking=True`` only when you must wait until the utterance finishes. + Example usage: speak("Hello, I am your robot assistant.") """ - if self._tts_node is None: + if self._tts_node is None or self._text_subject is None: return "Error: TTS not initialized" if not blocking: - thread = threading.Thread( - target=self._speak_bg, args=(text,), daemon=True, name="GeminiSpeakSkill-bg" - ) - with self._bg_threads_lock: - self._bg_threads.append(thread) - thread.start() + # Fire-and-forget: enqueue on the shared pipeline and return now. + self._text_subject.on_next(text) return f"Speaking (non-blocking): {text}" return self._speak_blocking(text) - def _speak_bg(self, text: str) -> None: - try: - self._speak_blocking(text) - finally: - # Remove this thread from the list of background threads when done - with self._bg_threads_lock: - self._bg_threads = [ - t for t in self._bg_threads if t is not threading.current_thread() - ] - def _speak_blocking(self, text: str) -> str: - # Use lock to prevent simultaneous speech + # Serialize blocking speech so utterances don't overlap on the speaker. with self._audio_lock: - if self._tts_node is None: + if self._tts_node is None or self._text_subject is None: return "Error: TTS not initialized" - text_subject: Subject[str] = Subject() audio_complete = threading.Event() - self._tts_node.consume_text(text_subject) - def set_as_complete(_t: str) -> None: - audio_complete.set() + # emit_text() re-emits the exact utterance once its synthesis finishes; + # match on the text so a concurrent non-blocking speak can't trip us. + def on_text(t: str) -> None: + if t == text: + audio_complete.set() - def set_as_complete_e(_e: Exception) -> None: + def on_error(_e: Exception) -> None: audio_complete.set() subscription = self._tts_node.emit_text().subscribe( - on_next=set_as_complete, - on_error=set_as_complete_e, + on_next=on_text, + on_error=on_error, ) - text_subject.on_next(text) - text_subject.on_completed() + self._text_subject.on_next(text) # Gemini synthesis is a network round-trip; allow more headroom than # the local-output time so first-token latency doesn't trip the wait. timeout = max(15, len(text) * 0.1) - - if not audio_complete.wait(timeout=timeout): - logger.warning(f"TTS timeout reached for: {text}") - subscription.dispose() - return f"Warning: TTS timeout while speaking: {text}" - else: - # Small delay to ensure buffers flush + try: + if not audio_complete.wait(timeout=timeout): + logger.warning(f"TTS timeout reached for: {text}") + return f"Warning: TTS timeout while speaking: {text}" + # Small delay to ensure buffers flush. time.sleep(0.3) - - subscription.dispose() - - return f"Spoke: {text}" + return f"Spoke: {text}" + finally: + subscription.dispose() if __name__ == "__main__": skill_module = GeminiSpeakSkill() skill_module.start() - print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.")) + print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.", blocking=True)) skill_module.stop() diff --git a/dimos/stream/audio/tts/node_gemini.py b/dimos/stream/audio/tts/node_gemini.py index 78d4194d08..6563fb6f9d 100644 --- a/dimos/stream/audio/tts/node_gemini.py +++ b/dimos/stream/audio/tts/node_gemini.py @@ -113,6 +113,19 @@ def consume_text(self, text_observable: Observable) -> "AbstractTextConsumer": Returns: Self for method chaining """ + # Idempotent: start the worker + subscription only once. This used to be + # called per utterance, leaking a new thread + subscription each time + # (and re-logging "Starting GeminiTTSNode" on every speak). If called + # again with a new source, just re-point the subscription. + if self.processing_thread is not None and self.processing_thread.is_alive(): + if self.subscription is not None: + self.subscription.dispose() + self.subscription = text_observable.subscribe( # type: ignore[assignment] + on_next=self._queue_text, + on_error=lambda e: logger.error(f"Error in GeminiTTSNode: {e}"), + ) + return self + logger.info("Starting GeminiTTSNode") # Start the processing thread From b45399cc05dc7bef39d87d1b35a01fc6fdf2eb43 Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 14:10:52 +0800 Subject: [PATCH 11/29] feat(take_picture): fire-and-forget upload (return immediately) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit take_picture blocked the agent on a synchronous httpx.post (up to 30s) before returning. Snapshot the current frame+pose synchronously, then dispatch the JPEG encode + POST to a daemon thread and return at once. Refactor the upload into _upload_frame(frame, pose, ...) shared by both the skill and the explore capture loop (via a thin _upload_current wrapper); track outstanding upload threads and join them in stop(). Trade-off: the skill no longer returns the storage key or surfaces HTTP errors to the agent — upload failures are logged only. That is the intended cost of returning fast. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/take_picture_skill.py | 65 +++++++++++++++++------ 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py index 8629b1fa87..0f9da19f1c 100644 --- a/dimos/agents/skills/take_picture_skill.py +++ b/dimos/agents/skills/take_picture_skill.py @@ -73,6 +73,9 @@ def start(self) -> None: self._pose: PoseStamped | None = None self._capture_thread: threading.Thread | None = None self._capture_stop = threading.Event() + # Outstanding fire-and-forget upload threads from take_picture(). + self._uploads: list[threading.Thread] = [] + self._uploads_lock = threading.Lock() self.color_image.subscribe(self._on_image) self.odom.subscribe(self._on_odom) @@ -82,6 +85,11 @@ def stop(self) -> None: thread = getattr(self, "_capture_thread", None) if thread is not None and thread.is_alive(): thread.join(timeout=5.0) + with self._uploads_lock: + uploads = list(self._uploads) + for t in uploads: + if t.is_alive(): + t.join(timeout=5.0) super().stop() def _on_image(self, image: Image) -> None: @@ -93,10 +101,15 @@ def _on_odom(self, pose: PoseStamped) -> None: def _configured(self) -> bool: return bool(self.config.robomoo_url and self.config.ingest_token) - # Encode the latest frame and POST it (with pose) to robomoo. Returns the - # stored key, or None if there's no frame / encode failed. Raises on HTTP error. - def _upload_current(self, note: str = "", label: str = "") -> str | None: - frame = getattr(self, "_latest", None) + # Encode a given frame and POST it (with pose) to robomoo. Returns the stored + # key, or None if there's no frame / encode failed. Raises on HTTP error. + def _upload_frame( + self, + frame: Image | None, + pose: PoseStamped | None, + note: str = "", + label: str = "", + ) -> str | None: if frame is None: return None ok, buf = cv2.imencode(".jpg", frame.data) @@ -108,7 +121,6 @@ def _upload_current(self, note: str = "", label: str = "") -> str | None: data["note"] = note if label: data["label"] = label - pose = getattr(self, "_pose", None) if pose is not None: data["poseX"] = str(pose.position.x) data["poseY"] = str(pose.position.y) @@ -123,29 +135,50 @@ def _upload_current(self, note: str = "", label: str = "") -> str | None: resp.raise_for_status() return resp.json().get("key", "") + # Thin wrapper used by the explore capture loop: upload the latest frame/pose. + def _upload_current(self, note: str = "", label: str = "") -> str | None: + return self._upload_frame( + getattr(self, "_latest", None), + getattr(self, "_pose", None), + note=note, + label=label, + ) + @skill def take_picture(self, note: str = "") -> SkillResult: """Capture a photo from the robot's camera and upload it. Use whenever the user asks the robot to take or capture a single picture or photo of what it currently sees. `note` is an optional short caption - to tag the image with (e.g. "kitchen", "plant"). + to tag the image with (e.g. "kitchen", "plant"). Returns immediately; the + encode + upload happen in the background. """ - if getattr(self, "_latest", None) is None: + frame = getattr(self, "_latest", None) + if frame is None: return SkillResult.fail("NO_FRAME", "No camera frame received yet") if not self._configured(): return SkillResult.fail( "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" ) - try: - key = self._upload_current(note=note, label=note) - except Exception as e: # noqa: BLE001 — surface upload failure to the agent - logger.exception("take_picture upload failed") - return SkillResult.fail("EXECUTION_FAILED", f"upload failed: {e}") - if key is None: - return SkillResult.fail("EXECUTION_FAILED", "JPEG encode failed") - logger.info("take_picture uploaded frame key=%s", key) - return SkillResult.ok(f"Picture taken and uploaded ({key})", key=key) + + # Snapshot the frame + pose now so the background upload sends exactly what + # the robot saw at call time, not a later frame. + pose = getattr(self, "_pose", None) + + def _bg() -> None: + try: + key = self._upload_frame(frame, pose, note=note, label=note) + logger.info("take_picture uploaded frame key=%s", key) + except Exception: # noqa: BLE001 — fire-and-forget: failures only logged + logger.exception("take_picture upload failed") + + t = threading.Thread(target=_bg, daemon=True, name="take-picture-upload") + with self._uploads_lock: + # Drop finished threads so the list doesn't grow unbounded. + self._uploads = [u for u in self._uploads if u.is_alive()] + self._uploads.append(t) + t.start() + return SkillResult.ok("Picture captured; uploading in the background.") @skill def explore_and_capture( From f92dd89364a9d7528493ad9d200eb9d0695cb08e Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 14:12:16 +0800 Subject: [PATCH 12/29] =?UTF-8?q?feat(go2):=20tilt=5Fbody=20skill=20?= =?UTF-8?q?=E2=80=94=20aim=20camera=20up/down=20via=20Euler=20body=20pitch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Go2 camera is body-fixed; the only way to look up/down is to pitch the body with the Euler sport command (api_id 1007), which was not exposed and needs a parameter payload execute_sport_command can't send. Add a tilt_body(pitch_deg, roll_deg, yaw_deg) skill that publishes Euler to SPORT_MOD with the {"parameter": {"data": {x,y,z}}} payload (same publish_request path execute_sport_command uses), converting degrees->radians and clamping to the safe standing envelope (±0.75/±0.75/±0.6 rad). Negative pitch looks up. Note: pitch sign and the {data:{x,y,z}} vs flat {x,y,z} payload form should be confirmed on-robot; the clamp is the safety net. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../robot/unitree/unitree_skill_container.py | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/dimos/robot/unitree/unitree_skill_container.py b/dimos/robot/unitree/unitree_skill_container.py index 88194473e6..160a1b268a 100644 --- a/dimos/robot/unitree/unitree_skill_container.py +++ b/dimos/robot/unitree/unitree_skill_container.py @@ -19,7 +19,7 @@ import math import time -from unitree_webrtc_connect.constants import RTC_TOPIC +from unitree_webrtc_connect.constants import RTC_TOPIC, SPORT_CMD from dimos.agents.annotation import skill from dimos.core.core import rpc @@ -300,6 +300,37 @@ def execute_sport_command(self, command_name: str) -> str: logger.error(f"Failed to execute {command_name}: {e}") return "Failed to execute the command." + @skill + def tilt_body( + self, pitch_deg: float = 0.0, roll_deg: float = 0.0, yaw_deg: float = 0.0 + ) -> str: + """Tilt the robot's body to aim its (fixed) camera up or down. + + The Go2's camera is body-mounted, so "looking up/down" means pitching the + whole body. NEGATIVE pitch_deg looks UP, positive looks DOWN. Useful range + is about -40..40 degrees; roll/yaw are optional. The robot must be standing + first (run StandUp or BalanceStand). This is a held pose — call tilt_body() + with no args to return level. + + Example: tilt_body(pitch_deg=-20) # look up at e.g. a tabletop + """ + # Clamp to the Go2's safe standing envelope (radians). + roll = max(-0.75, min(0.75, math.radians(float(roll_deg)))) + pitch = max(-0.75, min(0.75, math.radians(float(pitch_deg)))) + yaw = max(-0.6, min(0.6, math.radians(float(yaw_deg)))) + try: + self._connection.publish_request( + RTC_TOPIC["SPORT_MOD"], + { + "api_id": SPORT_CMD["Euler"], + "parameter": {"data": {"x": roll, "y": pitch, "z": yaw}}, + }, + ) + return f"Body tilted (pitch={pitch_deg}, roll={roll_deg}, yaw={yaw_deg} deg)." + except Exception as e: + logger.error(f"Failed to tilt body: {e}") + return "Failed to tilt the body." + _commands = "\n".join( [f'- "{name}": {description}' for name, (_, description) in _UNITREE_COMMANDS.items()] From 30a09bc379a3d50bd447c4bf7b7e4e9dfd3165a8 Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 14:14:51 +0800 Subject: [PATCH 13/29] feat(follow_person): local tracker between Gemini re-detections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a Mac (no CUDA) follow_person falls back to 'redetect', which ran a full synchronous Gemini detection every control cycle — capping the loop at ~0.5-1 Hz so the robot acted on stale velocity commands (the laggy feel). Decouple the two: a background thread re-detects every _redetect_period (0.8s) to anchor a cheap local OpenCV tracker, while the control loop runs at _frequency (20 Hz), updating the tracker locally and publishing a fresh twist each cycle. _create_tracker() picks the best available tracker (CSRT > KCF > MIL, across the main and legacy cv2 namespaces) so it auto-upgrades to CSRT where opencv-contrib exists and falls back to MIL (base OpenCV) here. Lost handling keeps the existing _lost_timeout semantics. The EdgeTAM (CUDA) path and auto mode-resolution are untouched, so GPU machines are unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/person_follow.py | 118 +++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 16 deletions(-) diff --git a/dimos/agents/skills/person_follow.py b/dimos/agents/skills/person_follow.py index a8ee6120f9..35ec768fa2 100644 --- a/dimos/agents/skills/person_follow.py +++ b/dimos/agents/skills/person_follow.py @@ -74,6 +74,7 @@ class PersonFollowSkillContainer(Module): _frequency: float = 20.0 # Hz - control loop frequency _max_lost_frames: int = 15 # number of frames to wait before declaring person lost _lost_timeout: float = 5.0 # seconds without a re-detection before declaring person lost + _redetect_period: float = 0.8 # seconds between VL re-anchors in "redetect" mode _patrolling_module_spec: PatrollingModuleSpec def __init__(self, **kwargs: Any) -> None: @@ -359,34 +360,119 @@ def _follow_person_redetect(self, query: str) -> str: return message - def _follow_loop_redetect(self, query: str) -> None: - # Detection latency (~1-2s/call) paces the loop; we hold the last command - # between detections and only give up after _lost_timeout without a hit. - last_detection_time = time.monotonic() + @staticmethod + def _create_tracker() -> Any: + """Best available OpenCV single-object tracker, or None. - while not self._should_stop.is_set(): - with self._lock: - latest_image = self._latest_image + Prefers CSRT > KCF > MIL and checks both the main and legacy namespaces, + so it auto-upgrades to CSRT where opencv-contrib is installed and falls + back to MIL (present in base OpenCV) otherwise. + """ + import cv2 - if latest_image is None: - time.sleep(0.1) + for ns in (cv2, getattr(cv2, "legacy", None)): + if ns is None: continue + for name in ("TrackerCSRT_create", "TrackerKCF_create", "TrackerMIL_create"): + ctor = getattr(ns, name, None) + if ctor is not None: + return ctor() + return None - bbox = get_object_bbox_from_image(self._vl_model, latest_image, query) + def _follow_loop_redetect(self, query: str) -> None: + """Follow without a GPU tracker (e.g. on a Mac). - if bbox is None: - if time.monotonic() - last_detection_time > self._lost_timeout: + A single Gemini detection takes ~1-2s, so running it inline would cap the + control loop at ~0.5-1 Hz and the robot would act on stale commands. Instead + a background thread re-detects every ``_redetect_period`` seconds to *anchor* + a cheap local OpenCV tracker, while this control loop runs at ``_frequency`` + Hz: each cycle it updates the tracker locally and publishes a fresh twist. + """ + det_lock = RLock() + # Latest VL detection handed from the detect thread to the control loop. + det: dict[str, Any] = {"bbox": None, "image": None, "seq": 0} + + def _detect_worker() -> None: + while not self._should_stop.is_set(): + with self._lock: + img = self._latest_image + if img is None: + self._should_stop.wait(0.05) + continue + bbox = get_object_bbox_from_image(self._vl_model, img, query) + if bbox is not None: + with det_lock: + det["bbox"] = bbox + det["image"] = img + det["seq"] += 1 + # The call itself already takes ~1-2s; pace re-anchors on top. + self._should_stop.wait(self._redetect_period) + + det_thread = Thread(target=_detect_worker, daemon=True, name="follow-redetect") + det_thread.start() + + period = 1.0 / self._frequency + next_time = time.monotonic() + last_good_time = time.monotonic() + tracker: Any = None + last_seq = -1 + + try: + while not self._should_stop.is_set(): + next_time += period + + with self._lock: + latest_image = self._latest_image + if latest_image is None: + self._sleep_until(next_time) + continue + + with det_lock: + seq, det_bbox, det_image = det["seq"], det["bbox"], det["image"] + + bbox = None + if seq != last_seq and det_bbox is not None: + # Fresh detection: (re)anchor the local tracker on it. + last_seq = seq + tracker = self._create_tracker() + if tracker is not None: + anchor = det_image.data if det_image is not None else latest_image.data + x1, y1, x2, y2 = det_bbox + try: + tracker.init(anchor, (int(x1), int(y1), int(x2 - x1), int(y2 - y1))) + except Exception as e: # noqa: BLE001 — tracker init is best-effort + logger.warning("tracker init failed, detection-only: %s", e) + tracker = None + bbox = det_bbox + last_good_time = time.monotonic() + elif tracker is not None: + # Between detections: cheap local update keeps the loop at rate. + ok, box = tracker.update(latest_image.data) + if ok: + x, y, w, h = box + bbox = (float(x), float(y), float(x + w), float(y + h)) + last_good_time = time.monotonic() + + if bbox is not None: + twist = self._visual_servo.compute_twist(bbox, latest_image.width) + self.cmd_vel.publish(twist) + elif time.monotonic() - last_good_time > self._lost_timeout: self.cmd_vel.publish(Twist.zero()) self._send_stop_reason(query, "lost track of the person") return - continue + # else: no bbox yet but within timeout — hold the last command. - last_detection_time = time.monotonic() - twist = self._visual_servo.compute_twist(bbox, latest_image.width) - self.cmd_vel.publish(twist) + self._sleep_until(next_time) + finally: + det_thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) self._send_stop_reason(query, "it was requested to stop following") + def _sleep_until(self, next_time: float) -> None: + sleep_duration = next_time - time.monotonic() + if sleep_duration > 0: + time.sleep(sleep_duration) + def _stop_following(self) -> None: self._should_stop.set() From e5b307a8a3fe60a6d7871476971ee8bb6a81c04d Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 15:24:38 +0800 Subject: [PATCH 14/29] =?UTF-8?q?feat(take=5Fpicture):=20tilt=5Fand=5Fcapt?= =?UTF-8?q?ure=20=E2=80=94=20aim=20camera,=20photograph,=20re-level=20in?= =?UTF-8?q?=20one=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tilting then capturing as separate agent calls is racy: tilt_body returns before the body settles and take_picture snapshots instantly, so the photo can catch a mid-tilt view. Add a deterministic tilt_and_capture(pitch_deg=-20, note, settle_s=1.0) that runs the whole sequence in the background — tilt, wait to settle, snapshot + upload the tilted view, then re-level — and returns at once. Reuses the existing ModuleRef-spec pattern: a new TiltSpec resolves structurally to UnitreeSkillContainer.tilt_body (which @skill exposes over rpc), so the capture module can aim the body-fixed camera without owning the WebRTC connection. Negative pitch looks up. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/take_picture_skill.py | 57 +++++++++++++++++++++++ dimos/robot/unitree/tilt_spec.py | 26 +++++++++++ 2 files changed, 83 insertions(+) create mode 100644 dimos/robot/unitree/tilt_spec.py diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py index 0f9da19f1c..1920254987 100644 --- a/dimos/agents/skills/take_picture_skill.py +++ b/dimos/agents/skills/take_picture_skill.py @@ -45,6 +45,7 @@ from dimos.navigation.frontier_exploration.frontier_explorer_spec import ( FrontierExplorerSpec, ) +from dimos.robot.unitree.tilt_spec import TiltSpec from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -65,6 +66,9 @@ class TakePictureSkill(Module): # Auto-wired (structurally) to WavefrontFrontierExplorer — lets us gate the # capture loop on whether exploration is still running. _explorer: FrontierExplorerSpec + # Auto-wired (structurally) to UnitreeSkillContainer.tilt_body — lets + # tilt_and_capture aim the body-fixed camera without owning the connection. + _tilt: TiltSpec @rpc def start(self) -> None: @@ -180,6 +184,59 @@ def _bg() -> None: t.start() return SkillResult.ok("Picture captured; uploading in the background.") + @skill + def tilt_and_capture( + self, + pitch_deg: float = -20.0, + note: str = "", + settle_s: float = 1.0, + ) -> SkillResult: + """Tilt the body to aim the camera, photograph that view, then re-level. + + Use to photograph things above or below the robot's straight-ahead view + (the camera is body-fixed). NEGATIVE pitch_deg looks UP, positive looks + DOWN. Runs in the background: tilts, waits `settle_s` for the body to + settle, captures + uploads the tilted view, then returns the body to + level. Returns immediately. The robot should be standing first. + """ + if getattr(self, "_latest", None) is None: + return SkillResult.fail("NO_FRAME", "No camera frame received yet") + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + def _bg() -> None: + try: + self._tilt.tilt_body(pitch_deg=pitch_deg) + # Wait for the body to physically reach the pose and a fresh + # camera frame to arrive before snapshotting. + time.sleep(settle_s) + key = self._upload_frame( + getattr(self, "_latest", None), + getattr(self, "_pose", None), + note=note, + label=note, + ) + logger.info("tilt_and_capture uploaded frame key=%s", key) + except Exception: # noqa: BLE001 — fire-and-forget: failures only logged + logger.exception("tilt_and_capture failed") + finally: + # Always return the body to level, even if capture failed. + try: + self._tilt.tilt_body() + except Exception: # noqa: BLE001 — best effort + logger.exception("tilt_and_capture re-level failed") + + t = threading.Thread(target=_bg, daemon=True, name="tilt-and-capture") + with self._uploads_lock: + self._uploads = [u for u in self._uploads if u.is_alive()] + self._uploads.append(t) + t.start() + return SkillResult.ok( + f"Tilting to pitch={pitch_deg} deg, capturing, then re-leveling (background)." + ) + @skill def explore_and_capture( self, diff --git a/dimos/robot/unitree/tilt_spec.py b/dimos/robot/unitree/tilt_spec.py new file mode 100644 index 0000000000..a1ef528c74 --- /dev/null +++ b/dimos/robot/unitree/tilt_spec.py @@ -0,0 +1,26 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Protocol + +from dimos.spec.utils import Spec + + +# Module-ref spec for tilting the robot body. Resolves (structurally) to +# UnitreeSkillContainer.tilt_body, letting another module (e.g. TakePictureSkill) +# aim the body-fixed camera without owning the WebRTC connection. +class TiltSpec(Spec, Protocol): + def tilt_body( + self, pitch_deg: float = 0.0, roll_deg: float = 0.0, yaw_deg: float = 0.0 + ) -> str: ... From e120a321ca02231528e172e6237373a9e3b81cbe Mon Sep 17 00:00:00 2001 From: grmkris Date: Wed, 27 May 2026 16:33:30 +0800 Subject: [PATCH 15/29] =?UTF-8?q?feat(go2):=20value-encoded=20occupancy=20?= =?UTF-8?q?upload=20+=20recording=E2=86=92robomoo=20exporter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit map_uploader now ships a value-preserving grayscale occupancy PNG (free=0, occupied=1..100, unknown=255) instead of a baked turbo image, so the web can recolor + overlay client-side. Add scripts/export_recording.py: reads a memory2 SqliteStore .db and pushes a top-down lidar occupancy map, downsampled odom trajectory, and CLIP-embedded keyframes (+thumbnails+pose) to robomoo's /api/robot/*. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/agents/skills/map_uploader.py | 18 ++- scripts/export_recording.py | 235 ++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+), 7 deletions(-) create mode 100644 scripts/export_recording.py diff --git a/dimos/agents/skills/map_uploader.py b/dimos/agents/skills/map_uploader.py index 945fef928b..9cbcde99f1 100644 --- a/dimos/agents/skills/map_uploader.py +++ b/dimos/agents/skills/map_uploader.py @@ -14,10 +14,11 @@ """Periodically upload the 2D occupancy map (global_costmap) to the robomoo app. -Subscribes to `global_costmap`, renders it to a PNG with the existing -`turbo_image` colormap (same look as the command-center), throttles, and POSTs -it plus grid metadata (resolution, origin, width, height) to robomoo's -`/api/robot/map`. The web app overlays capture pins on it via +Subscribes to `global_costmap`, encodes it as a *value-preserving* grayscale PNG +(free=0, occupied=1..100, unknown=255 — NOT a pre-colored image), throttles, and +POSTs it plus grid metadata (resolution, origin, width, height) to robomoo's +`/api/robot/map`. The web app reads the raw cell values back and applies its own +colormap + overlays, mapping world→pixel via `col = (x - originX) / resolution`, `row = (y - originY) / resolution`. Env: ROBOMOO_URL, ROBOT_INGEST_TOKEN. @@ -27,12 +28,12 @@ import time import cv2 +import numpy as np import httpx from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig from dimos.core.stream import In -from dimos.mapping.occupancy.visualizations import turbo_image from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid from dimos.utils.logging_config import setup_logger @@ -66,8 +67,11 @@ def _on_costmap(self, grid: OccupancyGrid) -> None: self._last = now try: - bgr = turbo_image(grid.grid) # (H, W, 3) uint8 - ok, buf = cv2.imencode(".png", bgr) + # Value-preserving encoding: free/occupied 0..100 stay as-is, unknown + # (-1) → 255. The web recolors from these raw values, so we never bake + # a colormap into the upload. (H, W) uint8 → grayscale PNG. + enc = np.where(grid.grid == -1, 255, np.clip(grid.grid, 0, 100)).astype(np.uint8) + ok, buf = cv2.imencode(".png", enc) if not ok: return httpx.post( diff --git a/scripts/export_recording.py b/scripts/export_recording.py new file mode 100644 index 0000000000..7ab2bf1876 --- /dev/null +++ b/scripts/export_recording.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. + +"""Export a memory2 recording into the robomoo web app. + +Reads a SqliteStore `.db` and pushes three structured artifacts over the same +token-guarded robot-ingest endpoints the live skills use (`map_uploader.py`, +`take_picture_skill.py`): + + 1. occupancy map — top-down lidar histogram → value-encoded grayscale PNG + → POST /api/robot/map + 2. trajectory — downsampled odom path [{ts,x,y,theta}] (JSON) + → POST /api/robot/trajectory + 3. embedded frames — throttled color_image keyframes, each CLIP-embedded + (same model the web searches with) + thumbnail + pose + → POST /api/robot/frame (with an `embedding` field) + +The web then renders an interactive vector map (pan/zoom), the driven path, and +in-browser CLIP semantic search over the frames. + +Usage: + ROBOMOO_URL=http://localhost:4470 ROBOT_INGEST_TOKEN=... \ + uv run python scripts/export_recording.py recording_go2.db +""" + +from __future__ import annotations + +import argparse +import json +import math +import os + +import cv2 +import httpx +import numpy as np + +from dimos.memory2.store.sqlite import SqliteStore +from dimos.memory2.transform import downsample, throttle +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +def quat_to_yaw(qx: float, qy: float, qz: float, qw: float) -> float: + """Heading (yaw) about world +z, radians.""" + return math.atan2(2.0 * (qw * qz + qx * qy), 1.0 - 2.0 * (qy * qy + qz * qz)) + + +def quat_to_matrix(qx: float, qy: float, qz: float, qw: float) -> np.ndarray: + """3x3 rotation matrix from a (x, y, z, w) quaternion.""" + return np.array( + [ + [1 - 2 * (qy * qy + qz * qz), 2 * (qx * qy - qz * qw), 2 * (qx * qz + qy * qw)], + [2 * (qx * qy + qz * qw), 1 - 2 * (qx * qx + qz * qz), 2 * (qy * qz - qx * qw)], + [2 * (qx * qz - qy * qw), 2 * (qy * qz + qx * qw), 1 - 2 * (qx * qx + qy * qy)], + ], + dtype=np.float64, + ) + + +def post(url: str, token: str, path: str, **kwargs) -> None: + resp = httpx.post( + f"{url.rstrip('/')}{path}", + headers={"Authorization": f"Bearer {token}"}, + timeout=60.0, + **kwargs, + ) + resp.raise_for_status() + + +def build_and_push_map( + store: SqliteStore, url: str, token: str, resolution: float, every: int +) -> None: + """Accumulate lidar into a world-frame top-down occupancy and upload it.""" + if "lidar" not in store.list_streams(): + logger.warning("no lidar stream — skipping map") + return + + pts_world: list[np.ndarray] = [] + for obs in store.streams.lidar.transform(downsample(every)): + if obs.pose is None: + continue + x, y, z, qx, qy, qz, qw = obs.pose + pts = np.asarray(obs.data.points_f32(), dtype=np.float64) + if pts.size == 0: + continue + world = pts @ quat_to_matrix(qx, qy, qz, qw).T + np.array([x, y, z]) + pts_world.append(world[:, :2]) # XY only + if not pts_world: + logger.warning("no lidar points with pose — skipping map") + return + + xy = np.concatenate(pts_world, axis=0) + min_x, min_y = xy.min(axis=0) + max_x, max_y = xy.max(axis=0) + width = max(1, int(math.ceil((max_x - min_x) / resolution))) + height = max(1, int(math.ceil((max_y - min_y) / resolution))) + + cols = np.clip(((xy[:, 0] - min_x) / resolution).astype(int), 0, width - 1) + rows = np.clip(((xy[:, 1] - min_y) / resolution).astype(int), 0, height - 1) + counts = np.zeros((height, width), dtype=np.int32) + np.add.at(counts, (rows, cols), 1) + + # value-encoded grayscale (matches map_uploader.py): unknown=255, occupied=100, + # lightly-hit cells scaled, free space left unknown (transparent on the web). + enc = np.full((height, width), 255, dtype=np.uint8) + hit = counts > 0 + enc[hit] = np.clip(counts[hit] * 25, 1, 100).astype(np.uint8) + + ok, buf = cv2.imencode(".png", enc) + if not ok: + logger.warning("map PNG encode failed") + return + post( + url, + token, + "/api/robot/map", + files={"file": ("map.png", buf.tobytes(), "image/png")}, + data={ + "resolution": str(resolution), + "originX": str(min_x), + "originY": str(min_y), + "width": str(width), + "height": str(height), + }, + ) + logger.info("pushed map %dx%d (res %.3f)", width, height, resolution) + + +def push_trajectory(store: SqliteStore, url: str, token: str, interval: float) -> None: + if "odom" not in store.list_streams(): + logger.warning("no odom stream — skipping trajectory") + return + points = [] + for obs in store.streams.odom.transform(throttle(interval)): + if obs.pose is None: + continue + x, y, _z, qx, qy, qz, qw = obs.pose + points.append({"ts": obs.ts, "x": x, "y": y, "theta": quat_to_yaw(qx, qy, qz, qw)}) + if not points: + logger.warning("no odom poses — skipping trajectory") + return + blob = json.dumps(points).encode("utf-8") + post( + url, + token, + "/api/robot/trajectory", + files={"file": ("trajectory.json", blob, "application/json")}, + ) + logger.info("pushed trajectory (%d points)", len(points)) + + +def push_frames( + store: SqliteStore, url: str, token: str, interval: float, max_frames: int, thumb_w: int +) -> None: + if "color_image" not in store.list_streams(): + logger.warning("no color_image stream — skipping frames") + return + from dimos.models.embedding.clip import CLIPModel + + clip = CLIPModel() + n = 0 + for obs in store.streams.color_image.transform(throttle(interval)): + if n >= max_frames: + break + img = obs.data + emb = clip.embed(img) + vec = emb.to_numpy().astype(float).ravel().tolist() + + bgr = img.to_opencv() + h, w = bgr.shape[:2] + if w > thumb_w: + bgr = cv2.resize(bgr, (thumb_w, int(h * thumb_w / w))) + ok, buf = cv2.imencode(".jpg", bgr, [cv2.IMWRITE_JPEG_QUALITY, 80]) + if not ok: + continue + + data = {"label": "frame", "embedding": json.dumps(vec)} + if obs.pose is not None: + data["poseX"] = str(obs.pose[0]) + data["poseY"] = str(obs.pose[1]) + post( + url, + token, + "/api/robot/frame", + files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, + data=data, + ) + n += 1 + logger.info("pushed %d embedded frames", n) + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("db", help="path to the recording .db") + ap.add_argument("--robomoo-url", default=os.getenv("ROBOMOO_URL", "")) + ap.add_argument("--token", default=os.getenv("ROBOT_INGEST_TOKEN", "")) + ap.add_argument("--map-resolution", type=float, default=0.05) + ap.add_argument("--lidar-every", type=int, default=10, help="use every Nth lidar scan") + ap.add_argument("--traj-interval", type=float, default=0.5, help="seconds between path points") + ap.add_argument("--frame-interval", type=float, default=2.0, help="seconds between keyframes") + ap.add_argument("--max-frames", type=int, default=120) + ap.add_argument("--thumb-width", type=int, default=384) + ap.add_argument("--no-map", action="store_true") + ap.add_argument("--no-trajectory", action="store_true") + ap.add_argument("--no-frames", action="store_true") + args = ap.parse_args() + + if not args.robomoo_url or not args.token: + raise SystemExit("set ROBOMOO_URL and ROBOT_INGEST_TOKEN (or pass --robomoo-url/--token)") + + store = SqliteStore(path=args.db) + logger.info("opened %s — streams: %s", args.db, store.list_streams()) + + if not args.no_map: + build_and_push_map(store, args.robomoo_url, args.token, args.map_resolution, args.lidar_every) + if not args.no_trajectory: + push_trajectory(store, args.robomoo_url, args.token, args.traj_interval) + if not args.no_frames: + push_frames( + store, + args.robomoo_url, + args.token, + args.frame_interval, + args.max_frames, + args.thumb_width, + ) + logger.info("done") + + +if __name__ == "__main__": + main() From fd3254cb5b218f46f9647d0ddbb21a4fb6bbf67f Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 17:43:03 +0800 Subject: [PATCH 16/29] chore: add go2-start.sh hackathon quickstart Wifi/ROBOT_IP sanity check, NTP sync, venv bootstrap, and `exec dimos run ` (default `unitree-go2-basic`). Co-Authored-By: Claude Opus 4.7 (1M context) --- go2-start.sh | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100755 go2-start.sh diff --git a/go2-start.sh b/go2-start.sh new file mode 100755 index 0000000000..24afb02f34 --- /dev/null +++ b/go2-start.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# go2-start.sh — hackathon quickstart for the dimairos05 Go2 +# Usage: +# ./go2-start.sh # default blueprint: unitree-go2-basic +# ./go2-start.sh unitree-go2-agentic-gemini +# ROBOT_IP=10.0.0.42 ./go2-start.sh # override IP (robot on shared wifi) + +set -euo pipefail + +# ---- config ---------------------------------------------------------------- +ROBOT_IP="${ROBOT_IP:-192.168.12.1}" +EXPECTED_SSID="${EXPECTED_SSID:-dimairos05}" +BLUEPRINT="${1:-unitree-go2-basic}" +VENV_DIR="${VENV_DIR:-.venv}" +# ---------------------------------------------------------------------------- + +c_red() { printf '\033[31m%s\033[0m\n' "$*"; } +c_green() { printf '\033[32m%s\033[0m\n' "$*"; } +c_yellow(){ printf '\033[33m%s\033[0m\n' "$*"; } +c_bold() { printf '\033[1m%s\033[0m\n' "$*"; } + +c_bold "▶ Go2 hackathon quickstart" +echo " blueprint : $BLUEPRINT" +echo " robot ip : $ROBOT_IP" +echo + +# 1. Wifi check (macOS) ------------------------------------------------------ +if [[ "$(uname)" == "Darwin" ]]; then + SSID="$(ipconfig getsummary en0 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" + if [[ -z "$SSID" ]]; then + SSID="$(ipconfig getsummary en1 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" + fi + if [[ "$SSID" == "$EXPECTED_SSID" ]]; then + c_green "✓ on wifi $SSID" + else + c_yellow "⚠ current wifi is '${SSID:-unknown}', expected '$EXPECTED_SSID'" + c_yellow " if the robot is on a different network, set ROBOT_IP and ignore this" + fi +else + c_yellow "⚠ non-macOS host, skipping wifi check" +fi + +# 2. Reachability ------------------------------------------------------------ +echo +echo "→ pinging $ROBOT_IP …" +if ping -c 3 -W 1000 "$ROBOT_IP" >/dev/null 2>&1; then + c_green "✓ robot reachable" +else + c_red "✗ cannot reach $ROBOT_IP" + c_red " check: joined dimairos05? robot powered on? sport mode on in the Unitree app?" + exit 1 +fi + +# 3. Clock sync (Unitree video desyncs without this) ------------------------- +echo +echo "→ syncing clock (sudo, may prompt for password) …" +if command -v sntp >/dev/null 2>&1; then + sudo sntp -sS pool.ntp.org >/dev/null 2>&1 \ + && c_green "✓ clock synced" \ + || c_yellow "⚠ clock sync failed (non-fatal, but video may lag lidar)" +elif command -v ntpdate >/dev/null 2>&1; then + sudo ntpdate pool.ntp.org >/dev/null 2>&1 \ + && c_green "✓ clock synced" \ + || c_yellow "⚠ clock sync failed (non-fatal)" +else + c_yellow "⚠ no sntp/ntpdate found, skipping clock sync" +fi + +# 4. Venv -------------------------------------------------------------------- +echo +if [[ -d "$VENV_DIR" ]]; then + c_green "✓ using existing venv $VENV_DIR" +else + c_yellow "no venv at $VENV_DIR — creating one" + if ! command -v uv >/dev/null 2>&1; then + c_red "✗ uv is not installed. install: brew install uv (or curl -LsSf https://astral.sh/uv/install.sh | sh)" + exit 1 + fi + uv venv --python 3.12 "$VENV_DIR" + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + uv pip install 'dimos[base,unitree]' +fi +# shellcheck disable=SC1091 +source "$VENV_DIR/bin/activate" + +# 5. Env vars ---------------------------------------------------------------- +export ROBOT_IP +# Agentic-gemini blueprint needs these; warn if missing for that path +if [[ "$BLUEPRINT" == *"gemini"* && -z "${GOOGLE_API_KEY:-}" ]]; then + c_yellow "⚠ GOOGLE_API_KEY not set — gemini blueprint will fail when the agent runs" +fi +if [[ "$BLUEPRINT" == *"agentic"* && "$BLUEPRINT" != *"ollama"* && "$BLUEPRINT" != *"gemini"* ]]; then + if [[ -z "${OPENAI_API_KEY:-}" ]]; then + c_yellow "⚠ OPENAI_API_KEY not set — agentic blueprint will fail when the agent runs" + fi +fi + +# 6. Launch ------------------------------------------------------------------ +echo +c_bold "▶ launching: dimos run $BLUEPRINT" +echo " command center → http://localhost:7779" +echo " ctrl-c to stop" +echo +exec dimos run "$BLUEPRINT" From 3af952e0cb0e0479727b5bc0f0884efa35f2f477 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 17:46:09 +0800 Subject: [PATCH 17/29] feat(web): MJPEG + snapshot bridge for color_image CameraMjpegModule (`camera-mjpeg-module`) republishes the `color_image` stream as an HTTP MJPEG feed and a single-JPEG snapshot, with CORS open: GET /video_feed/color_image multipart/x-mixed-replace GET /snapshot/color_image image/jpeg Default port 7780. Compose with any blueprint that publishes `color_image` (sim or real Go2): dimos --simulation run unitree-go2-basic camera-mjpeg-module Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/robot/all_blueprints.py | 1 + dimos/web/mjpeg_module.py | 97 +++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 dimos/web/mjpeg_module.py diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index eea47ae4c4..0bfed73845 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -128,6 +128,7 @@ "arm-teleop-module": "dimos.teleop.quest.quest_extensions.ArmTeleopModule", "b-box-navigation-module": "dimos.navigation.bbox_navigation.BBoxNavigationModule", "b1-connection-module": "dimos.robot.unitree.b1.connection.B1ConnectionModule", + "camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule", "camera-module": "dimos.hardware.sensors.camera.module.CameraModule", "cartesian-motion-controller": "dimos.manipulation.control.servo_control.cartesian_motion_controller.CartesianMotionController", "control-coordinator": "dimos.control.coordinator.ControlCoordinator", diff --git a/dimos/web/mjpeg_module.py b/dimos/web/mjpeg_module.py new file mode 100644 index 0000000000..ccac9200f8 --- /dev/null +++ b/dimos/web/mjpeg_module.py @@ -0,0 +1,97 @@ +"""Expose color_image as an MJPEG HTTP stream + single-frame snapshot. + +Usage: + dimos --simulation run unitree-go2-basic camera-mjpeg-module + # MJPEG: http://127.0.0.1:7780/video_feed/color_image + # snapshot: http://127.0.0.1:7780/snapshot/color_image +""" + +import threading +from typing import Any + +import cv2 +from fastapi import HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import Response +import numpy as np +import reactivex as rx +from reactivex import operators as ops + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger +from dimos.web.robot_web_interface import RobotWebInterface + +logger = setup_logger() + + +class CameraMjpegConfig(ModuleConfig): + port: int = 7780 + stream_key: str = "color_image" + + +class CameraMjpegModule(Module): + config: CameraMjpegConfig + color_image: In[Image] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._subject: rx.subject.Subject = rx.subject.Subject() + self._web: RobotWebInterface | None = None + self._thread: threading.Thread | None = None + self._latest_jpeg: bytes | None = None + self._latest_lock = threading.Lock() + + @rpc + def start(self) -> None: + super().start() + self._web = RobotWebInterface( + port=self.config.port, + **{self.config.stream_key: self._subject.pipe(ops.share())}, + ) + self._web.app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["GET"], + allow_headers=["*"], + ) + + stream_key = self.config.stream_key + + @self._web.app.get(f"/snapshot/{stream_key}") + def snapshot() -> Response: + with self._latest_lock: + buf = self._latest_jpeg + if buf is None: + raise HTTPException(status_code=503, detail="no frame yet") + return Response(content=buf, media_type="image/jpeg") + + self._thread = threading.Thread(target=self._web.run, daemon=True) + self._thread.start() + self.color_image.subscribe(self._on_image) + logger.info( + f"MJPEG: http://127.0.0.1:{self.config.port}/video_feed/{stream_key} " + f"| snapshot: http://127.0.0.1:{self.config.port}/snapshot/{stream_key}" + ) + + def _on_image(self, img: Image) -> None: + arr = img.as_numpy() + if arr.ndim == 3 and arr.shape[2] == 3: + arr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR) + self._subject.on_next(arr) + + ok, jpg = cv2.imencode(".jpg", arr) + if ok: + with self._latest_lock: + self._latest_jpeg = np.asarray(jpg).tobytes() + + @rpc + def stop(self) -> None: + if self._web is not None: + try: + self._web.shutdown() + except Exception: + pass + super().stop() From 86421bc75e2cacf91d6be9beec646ad95bb18570 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 17:46:35 +0800 Subject: [PATCH 18/29] feat(unitree): Go2 audio I/O and audio-ws web bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds full bidirectional audio support for the real Go2: - UnitreeWebRTCConnection.audio_stream() emits AudioMessage (int16 PCM + sample_rate + channels) by hooking the existing WebRTC audio transceiver and activating it via switchAudioChannel. Float frames are scaled before int16 cast. - UnitreeWebRTCConnection.play_wav_bytes(wav) uploads + plays a WAV through the audiohub megaphone, then exits megaphone mode after the clip's duration so subsequent commands work normally. - GO2Connection exposes audio: Out[AudioMessage] and audio_in: In[bytes], plus two skills: play_wav(wav_path) — local-filesystem WAV play_wav_b64(wav_b64) — base64 WAV for remote MCP clients - New AudioWsModule (`audio-ws-module`) bridges to the browser: WebSocket /audio_out binary PCM frames GET /audio_info sample_rate + channels (initial WS frame also broadcasts {"event":"format", ...}) POST /play WAV body for robot speaker Simulation has no audio; the wiring is hasattr-guarded so unitree-go2-basic still works in sim. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/robot/all_blueprints.py | 1 + dimos/robot/unitree/connection.py | 109 +++++++++++++++++++++ dimos/robot/unitree/go2/connection.py | 46 ++++++++- dimos/web/audio_ws_module.py | 136 ++++++++++++++++++++++++++ 4 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 dimos/web/audio_ws_module.py diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 0bfed73845..8adc202001 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -126,6 +126,7 @@ all_modules = { "alfred-high-level": "dimos.robot.diy.alfred.effector_high_level.AlfredHighLevel", "arm-teleop-module": "dimos.teleop.quest.quest_extensions.ArmTeleopModule", + "audio-ws-module": "dimos.web.audio_ws_module.AudioWsModule", "b-box-navigation-module": "dimos.navigation.bbox_navigation.BBoxNavigationModule", "b1-connection-module": "dimos.robot.unitree.b1.connection.B1ConnectionModule", "camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule", diff --git a/dimos/robot/unitree/connection.py b/dimos/robot/unitree/connection.py index 44101cc19d..a1fb5e55a7 100644 --- a/dimos/robot/unitree/connection.py +++ b/dimos/robot/unitree/connection.py @@ -15,9 +15,13 @@ import asyncio from dataclasses import dataclass import functools +import io +import os +import tempfile import threading import time from typing import Any, TypeAlias, TypeVar +import wave import numpy as np from numpy.typing import NDArray @@ -54,6 +58,15 @@ VideoMessage: TypeAlias = NDArray[np.uint8] # Shape: (height, width, 3) +@dataclass +class AudioMessage: + """PCM audio chunk received from the robot mic.""" + + data: bytes # int16 little-endian PCM + sample_rate: int + channels: int + + _T = TypeVar("_T", bound=Timestamped) @@ -405,6 +418,102 @@ def switch_video_channel_off() -> None: return subject.pipe(ops.finally_action(stop)) + @simple_mcache + def audio_stream(self) -> Observable[AudioMessage]: + """Subscribe to the robot's mic via WebRTC. + + Emits int16 PCM chunks. The Go2's WebRTC audio transceiver is sendrecv; + first frame triggers channel activation. + """ + subject: Subject[AudioMessage] = Subject() + stop_event = threading.Event() + + async def accept_track(frame: Any) -> None: + if stop_event.is_set(): + return + try: + arr = frame.to_ndarray() + rate = getattr(frame, "sample_rate", 48000) + # aiortc returns shape (channels, samples) for planar; assume mono if 1-D. + channels = arr.shape[0] if arr.ndim == 2 else 1 + if np.issubdtype(arr.dtype, np.floating): + arr = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16) + elif arr.dtype != np.int16: + arr = arr.astype(np.int16) + subject.on_next( + AudioMessage(data=arr.tobytes(), sample_rate=rate, channels=channels) + ) + except Exception: + pass + + self.conn.audio.add_track_callback(accept_track) + + def switch_on() -> None: + try: + self.conn.audio.switchAudioChannel(True) + except Exception: + pass + + self.loop.call_soon_threadsafe(switch_on) + + def stop() -> None: + stop_event.set() + try: + self.conn.audio.track_callbacks.remove(accept_track) + except ValueError: + pass + + def switch_off() -> None: + try: + self.conn.audio.switchAudioChannel(False) + except Exception: + pass + + self.loop.call_soon_threadsafe(switch_off) + + return subject.pipe(ops.finally_action(stop)) + + def play_wav_bytes(self, wav: bytes) -> None: + """Play a WAV through the Go2 speaker via the megaphone path. + + Fire-and-forget — returns immediately. The Go2 enters megaphone mode + for the duration of the clip then exits, so subsequent commands are + unaffected. + """ + if not wav: + return + + try: + with wave.open(io.BytesIO(wav), "rb") as wf: + duration = wf.getnframes() / float(wf.getframerate()) + except (wave.Error, EOFError): + duration = 5.0 # unknown format; bail out conservatively + + async def _upload_play_exit() -> None: + from unitree_webrtc_connect.webrtc_audiohub import WebRTCAudioHub + + tmp_path: str | None = None + try: + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: + fp.write(wav) + tmp_path = fp.name + + hub = WebRTCAudioHub(self.conn) + await hub.upload_megaphone(tmp_path) + await hub.enter_megaphone() + # Hold megaphone for the clip's duration, plus a small flush margin, + # then release so other commands work normally. + await asyncio.sleep(duration + 0.5) + await hub.exit_megaphone() + finally: + if tmp_path is not None: + try: + os.unlink(tmp_path) + except OSError: + pass + + asyncio.run_coroutine_threadsafe(_upload_play_exit(), self.loop) + def get_video_stream(self, fps: int = 30) -> Observable[Image]: """Get the video stream from the robot's camera. diff --git a/dimos/robot/unitree/go2/connection.py b/dimos/robot/unitree/go2/connection.py index 5568a473ef..e472c39bdb 100644 --- a/dimos/robot/unitree/go2/connection.py +++ b/dimos/robot/unitree/go2/connection.py @@ -48,7 +48,7 @@ from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.Image import Image from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 -from dimos.robot.unitree.connection import UnitreeWebRTCConnection +from dimos.robot.unitree.connection import AudioMessage, UnitreeWebRTCConnection from dimos.utils.decorators.decorators import cached_property, simple_mcache if sys.version_info < (3, 13): @@ -207,6 +207,8 @@ class GO2Connection(Module, Camera, Pointcloud): lidar: Out[PointCloud2] color_image: Out[Image] camera_info: Out[CameraInfo] + audio: Out[AudioMessage] + audio_in: In[bytes] connection: Go2ConnectionProtocol camera_info_static: CameraInfo = _camera_info_static() @@ -246,6 +248,19 @@ def onimage(image: Image) -> None: self.register_disposable(self.connection.video_stream().subscribe(onimage)) self.register_disposable(Disposable(self.cmd_vel.subscribe(self.move))) + if hasattr(self.connection, "audio_stream"): + try: + self.register_disposable( + self.connection.audio_stream().subscribe(self.audio.publish) + ) + except Exception as e: + logger.warning(f"audio_stream not started: {e}") + + if hasattr(self.connection, "play_wav_bytes"): + self.register_disposable( + Disposable(self.audio_in.subscribe(self.connection.play_wav_bytes)) + ) + self._camera_info_thread = Thread( target=self.publish_camera_info, daemon=True, @@ -352,6 +367,35 @@ def publish_request(self, topic: str, data: dict[str, Any]) -> dict[Any, Any]: """ return self.connection.publish_request(topic, data) + @skill + def play_wav(self, wav_path: str) -> str: + """Play a WAV file through the robot's speaker (megaphone mode). + + Args: + wav_path: Path on the dimos host filesystem. Use `play_wav_b64` + instead when the agent runs on a different machine. + """ + if not hasattr(self.connection, "play_wav_bytes"): + return "play_wav unavailable on this connection (simulation?)" + with open(wav_path, "rb") as f: + self.connection.play_wav_bytes(f.read()) + return f"queued {wav_path} for playback" + + @skill + def play_wav_b64(self, wav_b64: str) -> str: + """Play a base64-encoded WAV through the robot's speaker. + + Use this from remote MCP clients that don't share the dimos host's + filesystem. 44.1 kHz mono WAVs play most reliably. + """ + import base64 + + if not hasattr(self.connection, "play_wav_bytes"): + return "play_wav unavailable on this connection (simulation?)" + wav = base64.b64decode(wav_b64) + self.connection.play_wav_bytes(wav) + return f"queued {len(wav)} bytes for playback" + @skill def observe(self) -> Image | None: """Returns the latest video frame from the robot camera. Use this skill for any visual world queries. diff --git a/dimos/web/audio_ws_module.py b/dimos/web/audio_ws_module.py new file mode 100644 index 0000000000..a1c3ba360c --- /dev/null +++ b/dimos/web/audio_ws_module.py @@ -0,0 +1,136 @@ +"""Bridge robot audio to a web client. + + - WebSocket /audio_out -> sends int16 PCM frames from the Go2 mic + - GET /audio_info -> reports the current sample_rate + channels (so + consumers know how to decode /audio_out frames) + - POST /play -> uploads a WAV body, plays it on the Go2 speaker + +Compose with `unitree-go2-basic`: + + dimos run unitree-go2-basic audio-ws-module + +Then: + ws://127.0.0.1:7781/audio_out (mic, binary frames) + curl http://127.0.0.1:7781/audio_info (sample_rate, channels) + curl --data-binary @clip.wav http://127.0.0.1:7781/play +""" + +import asyncio +import threading +from typing import Any + +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect +from fastapi.middleware.cors import CORSMiddleware +import uvicorn + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.robot.unitree.connection import AudioMessage +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class AudioWsConfig(ModuleConfig): + port: int = 7781 + + +class AudioWsModule(Module): + config: AudioWsConfig + audio: In[AudioMessage] # robot mic (from GO2Connection.audio) + audio_in: Out[bytes] # WAV bytes for robot speaker (-> GO2Connection.audio_in) + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._app = FastAPI() + self._app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["GET", "POST"], + allow_headers=["*"], + ) + self._clients: set[WebSocket] = set() + self._loop: asyncio.AbstractEventLoop | None = None + self._server: uvicorn.Server | None = None + self._thread: threading.Thread | None = None + self._last_format: dict[str, int] | None = None + self._setup_routes() + + def _setup_routes(self) -> None: + @self._app.get("/audio_info") + def audio_info() -> dict[str, Any]: + if self._last_format is None: + return {"status": "no frame yet"} + return {"status": "ok", **self._last_format} + + @self._app.websocket("/audio_out") + async def audio_out(ws: WebSocket) -> None: + await ws.accept() + if self._last_format is not None: + await ws.send_json({"event": "format", **self._last_format}) + self._clients.add(ws) + try: + while True: + # Treat any incoming message as keep-alive; ignore content. + await ws.receive() + except WebSocketDisconnect: + pass + finally: + self._clients.discard(ws) + + @self._app.post("/play") + async def play(req: Request) -> dict[str, str]: + wav = await req.body() + if not wav: + return {"status": "empty"} + self.audio_in.publish(wav) + return {"status": "queued", "bytes": str(len(wav))} + + @rpc + def start(self) -> None: + super().start() + + self.audio.subscribe(self._on_audio) + + def run() -> None: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + config = uvicorn.Config( + self._app, host="127.0.0.1", port=self.config.port, log_level="warning" + ) + self._server = uvicorn.Server(config) + self._loop.run_until_complete(self._server.serve()) + + self._thread = threading.Thread(target=run, daemon=True) + self._thread.start() + logger.info( + f"audio-ws-module: ws://127.0.0.1:{self.config.port}/audio_out " + f"| GET http://127.0.0.1:{self.config.port}/audio_info " + f"| POST http://127.0.0.1:{self.config.port}/play" + ) + + def _on_audio(self, msg: AudioMessage) -> None: + self._last_format = {"sample_rate": msg.sample_rate, "channels": msg.channels} + if not self._clients or self._loop is None: + return + + chunk = msg.data + + async def broadcast() -> None: + stale: list[WebSocket] = [] + for ws in list(self._clients): + try: + await ws.send_bytes(chunk) + except Exception: + stale.append(ws) + for ws in stale: + self._clients.discard(ws) + + asyncio.run_coroutine_threadsafe(broadcast(), self._loop) + + @rpc + def stop(self) -> None: + if self._server is not None: + self._server.should_exit = True + super().stop() From 22ef9a14c0a6e3d9ef263ced44f0abf3106d886f Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 17:46:44 +0800 Subject: [PATCH 19/29] docs(journal): camera + audio web bridges Co-Authored-By: Claude Opus 4.7 (1M context) --- journal/2026-05-27-camera-audio-bridges.md | 109 +++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 journal/2026-05-27-camera-audio-bridges.md diff --git a/journal/2026-05-27-camera-audio-bridges.md b/journal/2026-05-27-camera-audio-bridges.md new file mode 100644 index 0000000000..0b562d816b --- /dev/null +++ b/journal/2026-05-27-camera-audio-bridges.md @@ -0,0 +1,109 @@ +# 2026-05-27 — camera + audio web bridges, Go2 audio scaffold + +Branch: `feat/gemini-go2-2245` + +## Goal +Stream the Go2 (sim or real) camera into a web page consumable by mlxvlm, +then add the same shape for robot audio (mic out, speaker in). + +## Camera + +### `dimos/web/mjpeg_module.py` (new) — `camera-mjpeg-module` +- Module subscribes to `color_image: In[Image]` and republishes via FastAPI on + `:7780`. +- Endpoints: + - `GET /video_feed/color_image` — MJPEG (`multipart/x-mixed-replace`) + - `GET /snapshot/color_image` — single JPEG (for backend pull, e.g. mlxvlm + `/api/analyze` server-side) +- CORS allow-all so a browser at a different origin can `drawImage` onto a + canvas without tainting. +- Default port moved from 5555 → 7780 to avoid common collisions. + +### `dimos/robot/all_blueprints.py` +- Registered `"camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule"`. + +### Use +``` +dimos --simulation run unitree-go2-basic camera-mjpeg-module +# http://127.0.0.1:7780/video_feed/color_image +# http://127.0.0.1:7780/snapshot/color_image +``` +Same URLs work against the real Go2 — drop `--simulation`, set `ROBOT_IP`. + +## Audio (real-robot only — sim has no audio) + +### `dimos/robot/unitree/connection.py` (modified) +- Added `AudioMessage(data: bytes, sample_rate: int, channels: int)` dataclass. +- `UnitreeWebRTCConnection.audio_stream()` — subscribes to + `LegionConnection.audio` via `add_track_callback`, calls + `switchAudioChannel(True)`, emits `AudioMessage` (int16 PCM at the frame's + native rate, usually 48 kHz mono). +- `UnitreeWebRTCConnection.play_wav_bytes(wav: bytes)` — writes a tempfile, + uploads via `WebRTCAudioHub.upload_megaphone`, then `enter_megaphone`. Fires + on the connection's asyncio loop; returns immediately. + +### `dimos/robot/unitree/go2/connection.py` (modified) +- New streams on `GO2Connection`: + - `audio: Out[bytes]` — mic data (currently only the PCM bytes — see + review issue #1, metadata is dropped). + - `audio_in: In[bytes]` — WAV bytes to play through the speaker. +- `start()` wires both if the connection supports them (`hasattr` guard for + sim). +- New skill `play_wav(wav_path: str)` — agent-facing, reads local WAV and + invokes `play_wav_bytes`. + +### `dimos/web/audio_ws_module.py` (new) — `audio-ws-module` +- Module with `audio: In[bytes]` (mic) and `audio_in: Out[bytes]` (speaker). +- FastAPI on `:7781`: + - `WebSocket /audio_out` — pushes binary PCM frames to connected clients. + - `POST /play` — body = WAV bytes; publishes to `audio_in` for the robot + speaker. +- CORS open on the HTTP routes. + +### `dimos/robot/all_blueprints.py` +- Registered `"audio-ws-module": "dimos.web.audio_ws_module.AudioWsModule"`. + +### Use (real Go2 only) +``` +dimos run unitree-go2-basic audio-ws-module +# ws://127.0.0.1:7781/audio_out (mic out, binary) +# curl --data-binary @clip.wav http://127.0.0.1:7781/play +``` + +## Other changes + +### `go2-start.sh` +- Quickstart script for the dimairos05 Go2 — wifi check, ping, NTP sync, venv + bootstrap, env-var sanity, `exec dimos run `. Useful at the + hackathon table. + +### Editable install +- Switched the venv from a one-shot `uv pip install 'dimos[sim] @ .'` to + `uv pip install -e '.[sim]'` so source edits apply without reinstall. + +## Review fixes applied +- `GO2Connection.audio` retyped `Out[AudioMessage]`; rate + channels carry + through to `audio-ws-module`, which now exposes them via `GET /audio_info` and + sends an initial `{"event":"format", ...}` JSON frame on WS connect. +- `play_wav_bytes` reads WAV duration via `wave`, sleeps for it, then calls + `exit_megaphone` so the robot doesn't stay in megaphone mode. +- Float frames from aiortc are now scaled `(arr * 32767).clip(...)` before + the int16 cast instead of being silently truncated. +- Added `play_wav_b64` skill alongside `play_wav` for remote MCP clients + that don't share the dimos host's filesystem. +- Moved `os` / `tempfile` / `wave` imports to module top. + +## Known issues / follow-ups +1. Stereo packed-vs-planar layout — `arr.shape[0]` assumes planar; Go2 mic + is mono so OK in practice. Document, don't fix yet. +2. `audio-ws-module` alone (without `unitree-go2-basic`) has an unbound `In` — + autoconnect doesn't warn loudly enough. +3. Untested: no real Go2 available during this session. Sim has no audio. + +## Composition cheatsheet +| Run | Endpoints | +|---|---| +| `unitree-go2-basic camera-mjpeg-module` | `:7780/video_feed/color_image`, `:7780/snapshot/color_image` | +| `unitree-go2-basic audio-ws-module` | `ws://:7781/audio_out`, `POST :7781/play` | +| `unitree-go2-basic camera-mjpeg-module audio-ws-module` | both | +| `unitree-go2-agentic camera-mjpeg-module audio-ws-module` | MCP + agent + both bridges | From f1266de7f1f6eaf4a2a8169eb545cf7d0ac6bd31 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 17:48:44 +0800 Subject: [PATCH 20/29] chore(go2-start): launch camera + audio bridges by default; add SIMULATION mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - BRIDGES=1 (default) appends `camera-mjpeg-module audio-ws-module` to the dimos run argv so the web endpoints come up automatically. - SIMULATION=1 swaps in `--simulation`, skips wifi/ping/NTP checks, installs the `sim` extra on first venv bootstrap. - EXTRA="…" lets callers tack on additional modules. - Banner prints every endpoint that will be available (command center, MJPEG, snapshot, audio WS, audio play). Co-Authored-By: Claude Opus 4.7 (1M context) --- go2-start.sh | 142 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 97 insertions(+), 45 deletions(-) diff --git a/go2-start.sh b/go2-start.sh index 24afb02f34..f25e99af7e 100755 --- a/go2-start.sh +++ b/go2-start.sh @@ -1,9 +1,19 @@ #!/usr/bin/env bash # go2-start.sh — hackathon quickstart for the dimairos05 Go2 +# # Usage: -# ./go2-start.sh # default blueprint: unitree-go2-basic -# ./go2-start.sh unitree-go2-agentic-gemini -# ROBOT_IP=10.0.0.42 ./go2-start.sh # override IP (robot on shared wifi) +# ./go2-start.sh # unitree-go2-basic + web bridges +# ./go2-start.sh unitree-go2-agentic-gemini # any blueprint +# ROBOT_IP=10.0.0.42 ./go2-start.sh # override IP +# BRIDGES=0 ./go2-start.sh # skip camera-mjpeg + audio-ws +# EXTRA="foo-module bar-module" ./go2-start.sh # tack on more modules +# SIMULATION=1 ./go2-start.sh # run MuJoCo sim (camera only) +# +# Default extras (with BRIDGES=1): +# camera-mjpeg-module http://127.0.0.1:7780/video_feed/color_image +# http://127.0.0.1:7780/snapshot/color_image +# audio-ws-module ws://127.0.0.1:7781/audio_out (robot only) +# POST http://127.0.0.1:7781/play (robot only) set -euo pipefail @@ -12,6 +22,9 @@ ROBOT_IP="${ROBOT_IP:-192.168.12.1}" EXPECTED_SSID="${EXPECTED_SSID:-dimairos05}" BLUEPRINT="${1:-unitree-go2-basic}" VENV_DIR="${VENV_DIR:-.venv}" +BRIDGES="${BRIDGES:-1}" +EXTRA="${EXTRA:-}" +SIMULATION="${SIMULATION:-0}" # ---------------------------------------------------------------------------- c_red() { printf '\033[31m%s\033[0m\n' "$*"; } @@ -19,51 +32,68 @@ c_green() { printf '\033[32m%s\033[0m\n' "$*"; } c_yellow(){ printf '\033[33m%s\033[0m\n' "$*"; } c_bold() { printf '\033[1m%s\033[0m\n' "$*"; } +# Assemble the module list once so it's reused for echo + exec. +MODULES=("$BLUEPRINT") +if [[ "$BRIDGES" == "1" ]]; then + MODULES+=("camera-mjpeg-module" "audio-ws-module") +fi +if [[ -n "$EXTRA" ]]; then + # shellcheck disable=SC2206 + MODULES+=( $EXTRA ) +fi + c_bold "▶ Go2 hackathon quickstart" -echo " blueprint : $BLUEPRINT" -echo " robot ip : $ROBOT_IP" +echo " modules : ${MODULES[*]}" +if [[ "$SIMULATION" == "1" ]]; then + echo " mode : simulation (no robot)" +else + echo " robot ip : $ROBOT_IP" +fi echo -# 1. Wifi check (macOS) ------------------------------------------------------ -if [[ "$(uname)" == "Darwin" ]]; then - SSID="$(ipconfig getsummary en0 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" - if [[ -z "$SSID" ]]; then - SSID="$(ipconfig getsummary en1 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" - fi - if [[ "$SSID" == "$EXPECTED_SSID" ]]; then - c_green "✓ on wifi $SSID" +if [[ "$SIMULATION" != "1" ]]; then + # 1. Wifi check (macOS) ---------------------------------------------------- + if [[ "$(uname)" == "Darwin" ]]; then + SSID="$(ipconfig getsummary en0 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" + if [[ -z "$SSID" ]]; then + SSID="$(ipconfig getsummary en1 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')" + fi + if [[ "$SSID" == "$EXPECTED_SSID" ]]; then + c_green "✓ on wifi $SSID" + else + c_yellow "⚠ current wifi is '${SSID:-unknown}', expected '$EXPECTED_SSID'" + c_yellow " if the robot is on a different network, set ROBOT_IP and ignore this" + fi else - c_yellow "⚠ current wifi is '${SSID:-unknown}', expected '$EXPECTED_SSID'" - c_yellow " if the robot is on a different network, set ROBOT_IP and ignore this" + c_yellow "⚠ non-macOS host, skipping wifi check" fi -else - c_yellow "⚠ non-macOS host, skipping wifi check" -fi -# 2. Reachability ------------------------------------------------------------ -echo -echo "→ pinging $ROBOT_IP …" -if ping -c 3 -W 1000 "$ROBOT_IP" >/dev/null 2>&1; then - c_green "✓ robot reachable" -else - c_red "✗ cannot reach $ROBOT_IP" - c_red " check: joined dimairos05? robot powered on? sport mode on in the Unitree app?" - exit 1 -fi + # 2. Reachability ---------------------------------------------------------- + echo + echo "→ pinging $ROBOT_IP …" + if ping -c 3 -W 1000 "$ROBOT_IP" >/dev/null 2>&1; then + c_green "✓ robot reachable" + else + c_red "✗ cannot reach $ROBOT_IP" + c_red " check: joined dimairos05? robot powered on? sport mode on in the Unitree app?" + c_red " (use SIMULATION=1 ./go2-start.sh to run MuJoCo instead)" + exit 1 + fi -# 3. Clock sync (Unitree video desyncs without this) ------------------------- -echo -echo "→ syncing clock (sudo, may prompt for password) …" -if command -v sntp >/dev/null 2>&1; then - sudo sntp -sS pool.ntp.org >/dev/null 2>&1 \ - && c_green "✓ clock synced" \ - || c_yellow "⚠ clock sync failed (non-fatal, but video may lag lidar)" -elif command -v ntpdate >/dev/null 2>&1; then - sudo ntpdate pool.ntp.org >/dev/null 2>&1 \ - && c_green "✓ clock synced" \ - || c_yellow "⚠ clock sync failed (non-fatal)" -else - c_yellow "⚠ no sntp/ntpdate found, skipping clock sync" + # 3. Clock sync (Unitree video desyncs without this) ----------------------- + echo + echo "→ syncing clock (sudo, may prompt for password) …" + if command -v sntp >/dev/null 2>&1; then + sudo sntp -sS pool.ntp.org >/dev/null 2>&1 \ + && c_green "✓ clock synced" \ + || c_yellow "⚠ clock sync failed (non-fatal, but video may lag lidar)" + elif command -v ntpdate >/dev/null 2>&1; then + sudo ntpdate pool.ntp.org >/dev/null 2>&1 \ + && c_green "✓ clock synced" \ + || c_yellow "⚠ clock sync failed (non-fatal)" + else + c_yellow "⚠ no sntp/ntpdate found, skipping clock sync" + fi fi # 4. Venv -------------------------------------------------------------------- @@ -79,7 +109,11 @@ else uv venv --python 3.12 "$VENV_DIR" # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" - uv pip install 'dimos[base,unitree]' + if [[ "$SIMULATION" == "1" ]]; then + uv pip install 'dimos[base,unitree,sim]' + else + uv pip install 'dimos[base,unitree]' + fi fi # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" @@ -98,8 +132,26 @@ fi # 6. Launch ------------------------------------------------------------------ echo -c_bold "▶ launching: dimos run $BLUEPRINT" -echo " command center → http://localhost:7779" +c_bold "▶ endpoints" +echo " command center : http://localhost:7779" +if [[ "$BRIDGES" == "1" ]]; then + echo " camera MJPEG : http://127.0.0.1:7780/video_feed/color_image" + echo " camera snapshot: http://127.0.0.1:7780/snapshot/color_image" + if [[ "$SIMULATION" == "1" ]]; then + echo " audio : (robot only — sim has no audio)" + else + echo " audio out (ws) : ws://127.0.0.1:7781/audio_out" + echo " audio info : http://127.0.0.1:7781/audio_info" + echo " audio play : POST http://127.0.0.1:7781/play" + fi +fi +echo +c_bold "▶ launching: dimos $([[ "$SIMULATION" == "1" ]] && echo "--simulation ")run ${MODULES[*]}" echo " ctrl-c to stop" echo -exec dimos run "$BLUEPRINT" + +if [[ "$SIMULATION" == "1" ]]; then + exec dimos --simulation run "${MODULES[@]}" +else + exec dimos run "${MODULES[@]}" +fi From 66a6b51ce1493ffcd2e9df3ad885178bfcd8d106 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 18:11:52 +0800 Subject: [PATCH 21/29] feat(go2-start): LM Studio + mlxvlm Gemma-4 local LLM presets + sim wrapper go2-start.sh - LMSTUDIO=1 routes McpClient to LM Studio's OpenAI-compatible server at http://127.0.0.1:1234/v1 (LMSTUDIO_MODEL=qwen/qwen3-8b by default). - MLXVLM=1 routes to the mlxvlm Gemma-4 server at http://127.0.0.1:8080/v1 (MLXVLM_MODEL defaults to gemma-4-E4B-it-MLX-4bit). - Probes /v1/models before launch; bails with a useful hint if the backend isn't up. - Both presets set OPENAI_BASE_URL + OPENAI_API_KEY and pass `-o mcpclient.model=openai:` to dimos run. - Warns when an LLM preset is set against a non-agentic blueprint. sim-with-llm.sh - One-liner: `./sim-with-llm.sh mlxvlm` runs the sim with the agentic blueprint pointed at mlxvlm; `lmstudio` and `ollama` variants too. Co-Authored-By: Claude Opus 4.7 (1M context) --- go2-start.sh | 90 ++++++++++++++++++++++++++++++++++++++++++------- sim-with-llm.sh | 37 ++++++++++++++++++++ 2 files changed, 114 insertions(+), 13 deletions(-) create mode 100755 sim-with-llm.sh diff --git a/go2-start.sh b/go2-start.sh index f25e99af7e..538992016f 100755 --- a/go2-start.sh +++ b/go2-start.sh @@ -2,12 +2,21 @@ # go2-start.sh — hackathon quickstart for the dimairos05 Go2 # # Usage: -# ./go2-start.sh # unitree-go2-basic + web bridges -# ./go2-start.sh unitree-go2-agentic-gemini # any blueprint -# ROBOT_IP=10.0.0.42 ./go2-start.sh # override IP -# BRIDGES=0 ./go2-start.sh # skip camera-mjpeg + audio-ws -# EXTRA="foo-module bar-module" ./go2-start.sh # tack on more modules -# SIMULATION=1 ./go2-start.sh # run MuJoCo sim (camera only) +# ./go2-start.sh # unitree-go2-basic + web bridges +# ./go2-start.sh unitree-go2-agentic-gemini # any blueprint +# ROBOT_IP=10.0.0.42 ./go2-start.sh # override IP +# BRIDGES=0 ./go2-start.sh # skip camera-mjpeg + audio-ws +# EXTRA="foo-module bar-module" ./go2-start.sh # tack on more modules +# SIMULATION=1 ./go2-start.sh # MuJoCo sim (camera only, no audio) +# +# Local LLM presets — pick exactly one (default model can be overridden): +# LMSTUDIO=1 ./go2-start.sh unitree-go2-agentic +# -> OpenAI-compat at http://127.0.0.1:1234/v1 +# -> LMSTUDIO_MODEL=qwen/qwen3-8b (override if a different model is loaded) +# +# MLXVLM=1 ./go2-start.sh unitree-go2-agentic +# -> OpenAI-compat at http://127.0.0.1:8080/v1 (mlxvlm Gemma-4 server) +# -> MLXVLM_MODEL=lmstudio-community/gemma-4-E4B-it-MLX-4bit # # Default extras (with BRIDGES=1): # camera-mjpeg-module http://127.0.0.1:7780/video_feed/color_image @@ -25,6 +34,10 @@ VENV_DIR="${VENV_DIR:-.venv}" BRIDGES="${BRIDGES:-1}" EXTRA="${EXTRA:-}" SIMULATION="${SIMULATION:-0}" +LMSTUDIO="${LMSTUDIO:-0}" +MLXVLM="${MLXVLM:-0}" +LMSTUDIO_MODEL="${LMSTUDIO_MODEL:-qwen/qwen3-8b}" +MLXVLM_MODEL="${MLXVLM_MODEL:-lmstudio-community/gemma-4-E4B-it-MLX-4bit}" # ---------------------------------------------------------------------------- c_red() { printf '\033[31m%s\033[0m\n' "$*"; } @@ -32,6 +45,12 @@ c_green() { printf '\033[32m%s\033[0m\n' "$*"; } c_yellow(){ printf '\033[33m%s\033[0m\n' "$*"; } c_bold() { printf '\033[1m%s\033[0m\n' "$*"; } +# Sanity: only one LLM preset at a time. +if [[ "$LMSTUDIO" == "1" && "$MLXVLM" == "1" ]]; then + c_red "✗ pick one: LMSTUDIO=1 or MLXVLM=1, not both" + exit 1 +fi + # Assemble the module list once so it's reused for echo + exec. MODULES=("$BLUEPRINT") if [[ "$BRIDGES" == "1" ]]; then @@ -42,6 +61,29 @@ if [[ -n "$EXTRA" ]]; then MODULES+=( $EXTRA ) fi +# Optional LLM endpoint override + model selection for the McpClient. +LLM_ARGS=() +LLM_NAME="" +LLM_URL="" +LLM_MODEL="" +if [[ "$LMSTUDIO" == "1" ]]; then + LLM_NAME="LM Studio" + LLM_URL="http://127.0.0.1:1234/v1" + LLM_MODEL="$LMSTUDIO_MODEL" +elif [[ "$MLXVLM" == "1" ]]; then + LLM_NAME="mlxvlm (Gemma 4)" + LLM_URL="http://127.0.0.1:8080/v1" + LLM_MODEL="$MLXVLM_MODEL" +fi +if [[ -n "$LLM_URL" ]]; then + export OPENAI_BASE_URL="$LLM_URL" + export OPENAI_API_KEY="${OPENAI_API_KEY:-local-llm}" # placeholder; servers ignore it + LLM_ARGS=( -o "mcpclient.model=openai:$LLM_MODEL" ) + if [[ "$BLUEPRINT" != *"agentic"* ]]; then + c_yellow "⚠ LLM preset is set but blueprint '$BLUEPRINT' has no McpClient — override is a no-op" + fi +fi + c_bold "▶ Go2 hackathon quickstart" echo " modules : ${MODULES[*]}" if [[ "$SIMULATION" == "1" ]]; then @@ -49,6 +91,10 @@ if [[ "$SIMULATION" == "1" ]]; then else echo " robot ip : $ROBOT_IP" fi +if [[ -n "$LLM_NAME" ]]; then + echo " LLM : $LLM_NAME → $LLM_URL" + echo " model : $LLM_MODEL" +fi echo if [[ "$SIMULATION" != "1" ]]; then @@ -96,7 +142,24 @@ if [[ "$SIMULATION" != "1" ]]; then fi fi -# 4. Venv -------------------------------------------------------------------- +# 4. LLM endpoint reachability check ---------------------------------------- +if [[ -n "$LLM_URL" ]]; then + echo + echo "→ probing $LLM_NAME at $LLM_URL/models …" + if curl -fsS -m 3 "$LLM_URL/models" >/dev/null 2>&1; then + c_green "✓ $LLM_NAME reachable" + else + c_red "✗ cannot reach $LLM_NAME at $LLM_URL" + if [[ "$LMSTUDIO" == "1" ]]; then + c_red " start LM Studio's Local Server (Cmd-Shift-2) and load a tool-capable model" + else + c_red " start mlxvlm: cd /Users/tex/repos/ai/mlx/mlxvlm && scripts/start-all.sh" + fi + exit 1 + fi +fi + +# 5. Venv -------------------------------------------------------------------- echo if [[ -d "$VENV_DIR" ]]; then c_green "✓ using existing venv $VENV_DIR" @@ -118,19 +181,20 @@ fi # shellcheck disable=SC1091 source "$VENV_DIR/bin/activate" -# 5. Env vars ---------------------------------------------------------------- +# 6. Env vars ---------------------------------------------------------------- export ROBOT_IP # Agentic-gemini blueprint needs these; warn if missing for that path if [[ "$BLUEPRINT" == *"gemini"* && -z "${GOOGLE_API_KEY:-}" ]]; then c_yellow "⚠ GOOGLE_API_KEY not set — gemini blueprint will fail when the agent runs" fi -if [[ "$BLUEPRINT" == *"agentic"* && "$BLUEPRINT" != *"ollama"* && "$BLUEPRINT" != *"gemini"* ]]; then +if [[ -z "$LLM_URL" && "$BLUEPRINT" == *"agentic"* && "$BLUEPRINT" != *"ollama"* && "$BLUEPRINT" != *"gemini"* ]]; then if [[ -z "${OPENAI_API_KEY:-}" ]]; then c_yellow "⚠ OPENAI_API_KEY not set — agentic blueprint will fail when the agent runs" + c_yellow " (or set LMSTUDIO=1 / MLXVLM=1 to use a local LLM)" fi fi -# 6. Launch ------------------------------------------------------------------ +# 7. Launch ------------------------------------------------------------------ echo c_bold "▶ endpoints" echo " command center : http://localhost:7779" @@ -146,12 +210,12 @@ if [[ "$BRIDGES" == "1" ]]; then fi fi echo -c_bold "▶ launching: dimos $([[ "$SIMULATION" == "1" ]] && echo "--simulation ")run ${MODULES[*]}" +c_bold "▶ launching: dimos $([[ "$SIMULATION" == "1" ]] && echo "--simulation ")run ${MODULES[*]} ${LLM_ARGS[*]:-}" echo " ctrl-c to stop" echo if [[ "$SIMULATION" == "1" ]]; then - exec dimos --simulation run "${MODULES[@]}" + exec dimos --simulation run "${MODULES[@]}" "${LLM_ARGS[@]}" else - exec dimos run "${MODULES[@]}" + exec dimos run "${MODULES[@]}" "${LLM_ARGS[@]}" fi diff --git a/sim-with-llm.sh b/sim-with-llm.sh new file mode 100755 index 0000000000..ce151d796b --- /dev/null +++ b/sim-with-llm.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# sim-with-llm.sh — sim + agentic + local LLM, one command. +# +# Usage: +# ./sim-with-llm.sh # mlxvlm Gemma-4 (default) +# ./sim-with-llm.sh lmstudio # LM Studio +# ./sim-with-llm.sh mlxvlm qwen3 # mlxvlm with a different loaded model +# BLUEPRINT=unitree-go2-agentic-ollama ./sim-with-llm.sh ollama +# +# Forwards to go2-start.sh with SIMULATION=1 and the right preset. + +set -euo pipefail + +BACKEND="${1:-mlxvlm}" +shift || true + +BLUEPRINT="${BLUEPRINT:-unitree-go2-agentic}" + +case "$BACKEND" in + mlxvlm) + export MLXVLM=1 + if [[ -n "${1:-}" ]]; then export MLXVLM_MODEL="$1"; fi + ;; + lmstudio) + export LMSTUDIO=1 + if [[ -n "${1:-}" ]]; then export LMSTUDIO_MODEL="$1"; fi + ;; + ollama) + BLUEPRINT="unitree-go2-agentic-ollama" + ;; + *) + echo "unknown backend: $BACKEND (use mlxvlm | lmstudio | ollama)" >&2 + exit 2 + ;; +esac + +exec env SIMULATION=1 ./go2-start.sh "$BLUEPRINT" From 97c3c0481778cb81ddd48e539d5ba8a9da3301fe Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 18:22:48 +0800 Subject: [PATCH 22/29] fix(sim-with-llm): default to Mac-friendly unitree-go2-agentic-gemini MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit unitree-go2-agentic pulls in SecurityModule (EdgeTAM/CUDA), local Moondream VL (crashes Metal), and OpenAI-hardcoded TTS — none of which boot on Apple Silicon. unitree-go2-agentic-gemini is the existing Mac skeleton that disables those and uses Gemini for VL/embeddings/TTS. The chat LLM override (LMSTUDIO/MLXVLM) still applies — it's just the McpClient. Surface a GOOGLE_API_KEY warning early since Gemini VL/embed/TTS fail at runtime without it. Co-Authored-By: Claude Opus 4.7 (1M context) --- sim-with-llm.sh | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/sim-with-llm.sh b/sim-with-llm.sh index ce151d796b..1c326a43ce 100755 --- a/sim-with-llm.sh +++ b/sim-with-llm.sh @@ -1,20 +1,32 @@ #!/usr/bin/env bash -# sim-with-llm.sh — sim + agentic + local LLM, one command. +# sim-with-llm.sh — sim + Mac-friendly agentic + local LLM, one command. +# +# Defaults to `unitree-go2-agentic-gemini` because it's the only agentic +# variant that boots on Apple Silicon: it disables SecurityModule (EdgeTAM +# needs CUDA), swaps Moondream (crashes Metal) → Gemini for VL, and replaces +# the OpenAI-hardcoded TTS with Gemini TTS. We only override its *chat* LLM +# (the McpClient) to point at LM Studio or mlxvlm. The VL/embedding/TTS +# pieces still need GOOGLE_API_KEY. # # Usage: -# ./sim-with-llm.sh # mlxvlm Gemma-4 (default) +# ./sim-with-llm.sh # mlxvlm Gemma-4 (default chat LLM) # ./sim-with-llm.sh lmstudio # LM Studio # ./sim-with-llm.sh mlxvlm qwen3 # mlxvlm with a different loaded model -# BLUEPRINT=unitree-go2-agentic-ollama ./sim-with-llm.sh ollama +# ./sim-with-llm.sh ollama # local ollama (uses native ollama blueprint) +# +# Override the base blueprint if you need a different shape: +# BLUEPRINT=unitree-go2-basic ./sim-with-llm.sh lmstudio # no agent, just streams # -# Forwards to go2-start.sh with SIMULATION=1 and the right preset. +# Required env (for VL/embedding/TTS on Mac): +# GOOGLE_API_KEY=… # https://aistudio.google.com/app/apikey set -euo pipefail BACKEND="${1:-mlxvlm}" shift || true -BLUEPRINT="${BLUEPRINT:-unitree-go2-agentic}" +# Mac-friendly default: no CUDA, no local Moondream, OpenAI-TTS disabled. +BLUEPRINT="${BLUEPRINT:-unitree-go2-agentic-gemini}" case "$BACKEND" in mlxvlm) @@ -34,4 +46,11 @@ case "$BACKEND" in ;; esac +# The gemini-shaped blueprint still needs GOOGLE_API_KEY for VL/embeddings/TTS +# even when the chat LLM is local. Surface that early instead of crashing later. +if [[ "$BLUEPRINT" == *"gemini"* && -z "${GOOGLE_API_KEY:-}" ]]; then + printf '\033[33m⚠ GOOGLE_API_KEY not set — Gemini VL/embeddings/TTS will fail at runtime.\033[0m\n' + printf '\033[33m Get one at https://aistudio.google.com/app/apikey, then re-run.\033[0m\n' +fi + exec env SIMULATION=1 ./go2-start.sh "$BLUEPRINT" From 3b83498f646776d0433c9a0d5b141dc6e1efca79 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 18:34:26 +0800 Subject: [PATCH 23/29] fix(sim-with-llm): avoid google-genai import; minimal Mac+LM-Studio compose unitree-go2-agentic-gemini imports GeminiSpeakSkill at module load (before any --disable can apply), and google-genai isn't installed, so the gemini blueprint crashed at import. Switch the default compose to: unitree-go2-basic (camera + viz, no Google imports) + mcp-server + mcp-client (MCP agent loop) + unitree-skill-container (wait / current_time / sport / tilt_body) + camera-mjpeg-module + audio-ws-module (from BRIDGES=1) For chat: LMSTUDIO=1 or MLXVLM=1 reroutes McpClient as before. For ollama: keep the existing unitree-go2-agentic-ollama (clean Mac compose). Trade-off: no relative_move skill (needs the nav stack). Publish to /cmd_vel directly if movement is needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- sim-with-llm.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sim-with-llm.sh b/sim-with-llm.sh index 1c326a43ce..9abaceec10 100755 --- a/sim-with-llm.sh +++ b/sim-with-llm.sh @@ -25,8 +25,15 @@ set -euo pipefail BACKEND="${1:-mlxvlm}" shift || true -# Mac-friendly default: no CUDA, no local Moondream, OpenAI-TTS disabled. -BLUEPRINT="${BLUEPRINT:-unitree-go2-agentic-gemini}" +# Pick a blueprint compose that imports cleanly on Apple Silicon without +# google-genai. unitree-go2-agentic-gemini imports GeminiSpeakSkill at +# module load (before --disable can take effect), so we don't use it here. +# unitree-go2-basic + mcp-server + mcp-client gives the full agent loop; +# unitree-skill-container adds wait / current_time / execute_sport_command +# / tilt_body skills. relative_move needs the nav stack and is not in this +# compose — publish to /cmd_vel directly if you need movement. +BLUEPRINT="${BLUEPRINT:-unitree-go2-basic}" +export EXTRA="${EXTRA:-mcp-server mcp-client unitree-skill-container}" case "$BACKEND" in mlxvlm) @@ -38,7 +45,9 @@ case "$BACKEND" in if [[ -n "${1:-}" ]]; then export LMSTUDIO_MODEL="$1"; fi ;; ollama) + # The ollama agentic blueprint is already a clean Mac compose. BLUEPRINT="unitree-go2-agentic-ollama" + export EXTRA="" ;; *) echo "unknown backend: $BACKEND (use mlxvlm | lmstudio | ollama)" >&2 @@ -46,11 +55,4 @@ case "$BACKEND" in ;; esac -# The gemini-shaped blueprint still needs GOOGLE_API_KEY for VL/embeddings/TTS -# even when the chat LLM is local. Surface that early instead of crashing later. -if [[ "$BLUEPRINT" == *"gemini"* && -z "${GOOGLE_API_KEY:-}" ]]; then - printf '\033[33m⚠ GOOGLE_API_KEY not set — Gemini VL/embeddings/TTS will fail at runtime.\033[0m\n' - printf '\033[33m Get one at https://aistudio.google.com/app/apikey, then re-run.\033[0m\n' -fi - exec env SIMULATION=1 ./go2-start.sh "$BLUEPRINT" From 277321332c2205718267d173863bc15b92225179 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 18:37:10 +0800 Subject: [PATCH 24/29] fix(sim-with-llm): drop unitree-skill-container from minimal default UnitreeSkillContainer requires a NavigationInterfaceSpec module (replanning-a-star-planner + nav-stack chain). Without it, build fails: "No module met that spec." Minimal default is now just: unitree-go2-basic + mcp-server + mcp-client + bridges Skills available: observe, play_wav, play_wav_b64. Header documents how to layer in unitree-skill-container + replanning-a-star-planner when movement skills are needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- sim-with-llm.sh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sim-with-llm.sh b/sim-with-llm.sh index 9abaceec10..991ddf1752 100755 --- a/sim-with-llm.sh +++ b/sim-with-llm.sh @@ -25,15 +25,17 @@ set -euo pipefail BACKEND="${1:-mlxvlm}" shift || true -# Pick a blueprint compose that imports cleanly on Apple Silicon without -# google-genai. unitree-go2-agentic-gemini imports GeminiSpeakSkill at -# module load (before --disable can take effect), so we don't use it here. -# unitree-go2-basic + mcp-server + mcp-client gives the full agent loop; -# unitree-skill-container adds wait / current_time / execute_sport_command -# / tilt_body skills. relative_move needs the nav stack and is not in this -# compose — publish to /cmd_vel directly if you need movement. +# Minimal Mac+local-LLM compose that imports cleanly without google-genai: +# unitree-go2-basic (camera + 3D viz) +# + mcp-server + mcp-client (the MCP agent loop) +# Skills exposed (from GO2Connection): observe, play_wav, play_wav_b64. +# +# To add movement / sport / tilt skills, also pass: +# EXTRA="mcp-server mcp-client unitree-skill-container replanning-a-star-planner" +# (the planner satisfies UnitreeSkillContainer's NavigationInterfaceSpec dep, +# and pulls in the nav-stack chain — heavier but enables relative_move). BLUEPRINT="${BLUEPRINT:-unitree-go2-basic}" -export EXTRA="${EXTRA:-mcp-server mcp-client unitree-skill-container}" +export EXTRA="${EXTRA:-mcp-server mcp-client}" case "$BACKEND" in mlxvlm) From d62128d2c1dcded053eff7b224c78d0cd32dccd9 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 18:41:38 +0800 Subject: [PATCH 25/29] docs(journal): Mac + local-LLM agentic landscape Document the blueprint compatibility matrix on Apple Silicon, the skill availability per compose, and the VL backend gotcha (qwen is Alibaba cloud, not local MLX; moondream crashes Metal; gemini is Google cloud). Notes the missing piece: a Mac-local VL backend (mlxvlm/openai_compat) that would unlock the richer NavigationSkill / PersonFollow containers. Co-Authored-By: Claude Opus 4.7 (1M context) --- journal/2026-05-27-camera-audio-bridges.md | 54 ++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/journal/2026-05-27-camera-audio-bridges.md b/journal/2026-05-27-camera-audio-bridges.md index 0b562d816b..9c03d04122 100644 --- a/journal/2026-05-27-camera-audio-bridges.md +++ b/journal/2026-05-27-camera-audio-bridges.md @@ -100,6 +100,60 @@ dimos run unitree-go2-basic audio-ws-module autoconnect doesn't warn loudly enough. 3. Untested: no real Go2 available during this session. Sim has no audio. +## Mac + local-LLM agentic — what works, what doesn't + +Goal: drive an MCP agent on Apple Silicon with LM Studio (:1234) or the +mlxvlm Gemma-4 server (:8080), no CUDA, no Google services. + +### Blueprints + +| Blueprint | Boots on Mac? | Notes | +|---|---|---| +| `unitree-go2-agentic` | ✗ | imports SecurityModule (EdgeTAM/CUDA) and Moondream VL (crashes Metal) | +| `unitree-go2-agentic-gemini` | ✗ as-is | imports `GeminiSpeakSkill` → `from google import genai` at module load. Run `uv pip install google-genai` first, even if you don't call any Gemini APIs. | +| `unitree-go2-agentic-ollama` | ✓ | clean compose; `./sim-with-llm.sh ollama` uses it | +| `unitree-go2-basic + mcp-server + mcp-client` | ✓ | minimal but reliable; default for `./sim-with-llm.sh lmstudio` and `mlxvlm` | + +### Skill availability by compose + +| Compose | Skills the agent gets | +|---|---| +| `unitree-go2-basic + mcp-server + mcp-client` | `observe`, `play_wav`, `play_wav_b64` (last two no-op in sim) | +| `… + unitree-skill-container + replanning-a-star-planner` | adds `relative_move`, `wait`, `current_time`, `tilt_body`, `execute_sport_command` — heavier startup (full nav stack) | + +`unitree-skill-container` alone fails build: +`"No module met NavigationInterfaceSpec spec"`. The implementor is +`replanning-a-star-planner`. + +### VL backends — Mac gotcha + +`dimos/models/vl/types.py:17` literal: `qwen | moondream | gemini`. + +- `qwen` — **not local**: hits Alibaba DashScope cloud (`dashscope-intl.aliyuncs.com`), needs `ALIBABA_API_KEY`. +- `moondream` — local; crashes Metal on Apple Silicon (per the `-gemini` blueprint docstring). +- `gemini` — Google cloud, needs `GOOGLE_API_KEY`. + +There is currently **no Mac-local VL backend**. The user has mlxvlm on :8080 +(Gemma-4-E4B + Falcon-Perception, OpenAI-compatible) — natural place to plug +a new `mlxvlm` / `openai_compat` backend. Not implemented yet; needs a +new file under `dimos/models/vl/`. + +### LLM chat backends + +- **LM Studio at :1234** and **mlxvlm at :8080** both speak `/v1/chat/completions` + with proper OpenAI tool-call passthrough — `langchain-openai` (already + installed) handles them via `OPENAI_BASE_URL`. No new dimos module + needed for chat. +- McpClient model override via `-o mcpclient.model=openai:` works. + Wired up in `go2-start.sh` LMSTUDIO/MLXVLM presets. + +### Next iteration +Add `mlxvlm` to `VlModelName` so `NavigationSkillContainer` and +`PersonFollowSkillContainer` can run with a local VL backend. With that ++ `google-genai` installed, `unitree-go2-agentic-gemini` becomes the +richest Mac agentic compose if we also `--disable` the Gemini-runtime +modules (`gemini-speak-skill`, `map-uploader`, `spatial-memory`). + ## Composition cheatsheet | Run | Endpoints | |---|---| From c3dee5a85d132f6ad75f247ca09519b1a8d934dc Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 20:23:05 +0800 Subject: [PATCH 26/29] fix(audio-ws): use Server.run() + queue handoff; surface bind errors The previous start() did `loop.run_until_complete(server.serve())` from a daemon thread and used `run_coroutine_threadsafe` to broadcast audio frames, which races the loop startup and silently raises "Event loop stopped before Future completed" when uvicorn shuts down or fails to bind. Now: - uvicorn.Server.run() owns its own loop and lifecycle. - Subscriber thread enqueues frames; a coroutine inside the uvicorn loop (spawned at FastAPI startup) drains the queue and fans out to WebSocket clients. - Bind errors (port already in use) are logged with the actual OSError and a kill hint, instead of a generic "loop stopped". - Drops oldest frame on overflow so latency stays bounded. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/web/audio_ws_module.py | 81 +++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/dimos/web/audio_ws_module.py b/dimos/web/audio_ws_module.py index a1c3ba360c..8f54d0a57f 100644 --- a/dimos/web/audio_ws_module.py +++ b/dimos/web/audio_ws_module.py @@ -16,6 +16,7 @@ """ import asyncio +import queue import threading from typing import Any @@ -50,11 +51,13 @@ def __init__(self, **kwargs: Any) -> None: allow_methods=["GET", "POST"], allow_headers=["*"], ) + # Cross-thread handoff: subscriber thread enqueues, broadcast task drains. + self._frame_queue: queue.Queue[bytes] = queue.Queue(maxsize=200) self._clients: set[WebSocket] = set() - self._loop: asyncio.AbstractEventLoop | None = None self._server: uvicorn.Server | None = None self._thread: threading.Thread | None = None self._last_format: dict[str, int] | None = None + self._stop_event = threading.Event() self._setup_routes() def _setup_routes(self) -> None: @@ -72,7 +75,7 @@ async def audio_out(ws: WebSocket) -> None: self._clients.add(ws) try: while True: - # Treat any incoming message as keep-alive; ignore content. + # Server -> client only; sink any inbound messages. await ws.receive() except WebSocketDisconnect: pass @@ -87,6 +90,29 @@ async def play(req: Request) -> dict[str, str]: self.audio_in.publish(wav) return {"status": "queued", "bytes": str(len(wav))} + @self._app.on_event("startup") + async def _spawn_broadcaster() -> None: + asyncio.create_task(self._broadcast_loop()) + + async def _broadcast_loop(self) -> None: + """Drain the cross-thread queue and fan out to WebSocket clients.""" + loop = asyncio.get_running_loop() + while not self._stop_event.is_set(): + try: + chunk = await loop.run_in_executor(None, self._frame_queue.get, True, 0.5) + except queue.Empty: + continue + if not self._clients: + continue + stale: list[WebSocket] = [] + for ws in list(self._clients): + try: + await ws.send_bytes(chunk) + except Exception: + stale.append(ws) + for ws in stale: + self._clients.discard(ws) + @rpc def start(self) -> None: super().start() @@ -94,15 +120,28 @@ def start(self) -> None: self.audio.subscribe(self._on_audio) def run() -> None: - self._loop = asyncio.new_event_loop() - asyncio.set_event_loop(self._loop) config = uvicorn.Config( - self._app, host="127.0.0.1", port=self.config.port, log_level="warning" + self._app, + host="127.0.0.1", + port=self.config.port, + log_level="warning", + lifespan="on", ) self._server = uvicorn.Server(config) - self._loop.run_until_complete(self._server.serve()) - - self._thread = threading.Thread(target=run, daemon=True) + try: + # Server.run() owns its own asyncio loop and cleanup; this + # avoids the run_until_complete/run_coroutine_threadsafe race + # that produced "Event loop stopped before Future completed". + self._server.run() + except OSError as e: + logger.error( + f"audio-ws failed to bind :{self.config.port} ({e}); " + f"is another instance running? `lsof -ti :{self.config.port} | xargs kill -9`" + ) + except Exception: + logger.exception("audio-ws server crashed") + + self._thread = threading.Thread(target=run, daemon=True, name="audio-ws-uvicorn") self._thread.start() logger.info( f"audio-ws-module: ws://127.0.0.1:{self.config.port}/audio_out " @@ -112,25 +151,19 @@ def run() -> None: def _on_audio(self, msg: AudioMessage) -> None: self._last_format = {"sample_rate": msg.sample_rate, "channels": msg.channels} - if not self._clients or self._loop is None: - return - - chunk = msg.data - - async def broadcast() -> None: - stale: list[WebSocket] = [] - for ws in list(self._clients): - try: - await ws.send_bytes(chunk) - except Exception: - stale.append(ws) - for ws in stale: - self._clients.discard(ws) - - asyncio.run_coroutine_threadsafe(broadcast(), self._loop) + try: + self._frame_queue.put_nowait(msg.data) + except queue.Full: + # Drop oldest to bound latency. + try: + self._frame_queue.get_nowait() + self._frame_queue.put_nowait(msg.data) + except queue.Empty: + pass @rpc def stop(self) -> None: + self._stop_event.set() if self._server is not None: self._server.should_exit = True super().stop() From b626d280b2b9b9a095daadc084e3af119995a896 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 20:26:03 +0800 Subject: [PATCH 27/29] =?UTF-8?q?feat(web):=20cmd-bridge-module=20?= =?UTF-8?q?=E2=80=94=20HTTP=20cmd=5Fvel/path/pose=20bridge=20+=20mlxvlm=20?= =?UTF-8?q?brief?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cmd-bridge-module exposes the robot's drive interface over HTTP so an external VLM (mlxvlm Gemma-4) can run a perceive-act loop: POST /cmd_vel one Twist + duration POST /path sequence of Twist steps (raw or semantic forward/left/degrees) POST /stop cancel any in-flight /path GET /pose current base_link pose in world frame Open-loop (no SLAM, no obstacle avoidance) so it works in sim and on the bare-metal robot without the nav stack. The VLM is expected to re-plan each iteration from a fresh camera frame. journal/2026-05-27-mlxvlm-robot-integration-prompt.md is the brief for the Claude working on the mlxvlm side: endpoint contract, the `/api/robot/navigate` perceive-act loop, JSON schema for the model's per-step reply, failure modes, and sim-only testing instructions. Co-Authored-By: Claude Opus 4.7 (1M context) --- dimos/robot/all_blueprints.py | 1 + dimos/web/cmd_bridge_module.py | 222 ++++++++++++++++++ ...6-05-27-mlxvlm-robot-integration-prompt.md | 123 ++++++++++ 3 files changed, 346 insertions(+) create mode 100644 dimos/web/cmd_bridge_module.py create mode 100644 journal/2026-05-27-mlxvlm-robot-integration-prompt.md diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 8adc202001..196c84dd94 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -132,6 +132,7 @@ "camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule", "camera-module": "dimos.hardware.sensors.camera.module.CameraModule", "cartesian-motion-controller": "dimos.manipulation.control.servo_control.cartesian_motion_controller.CartesianMotionController", + "cmd-bridge-module": "dimos.web.cmd_bridge_module.CmdBridgeModule", "control-coordinator": "dimos.control.coordinator.ControlCoordinator", "cost-mapper": "dimos.mapping.costmapper.CostMapper", "demo-calculator-skill": "dimos.agents.skills.demo_calculator_skill.DemoCalculatorSkill", diff --git a/dimos/web/cmd_bridge_module.py b/dimos/web/cmd_bridge_module.py new file mode 100644 index 0000000000..266808bfa9 --- /dev/null +++ b/dimos/web/cmd_bridge_module.py @@ -0,0 +1,222 @@ +"""HTTP bridge for driving the robot from an external process. + + - POST /cmd_vel -> publish a single Twist for `duration` seconds (then 0) + - POST /path -> execute a sequence of relative moves in order + - POST /stop -> emergency zero-Twist + - GET /pose -> latest base_link pose in the world frame + +Compose with any blueprint that has GO2Connection (sim or real robot): + + dimos --simulation run unitree-go2-basic cmd-bridge-module + +The "path" is a list of relative steps that are sent as raw cmd_vel +Twists for a per-step duration, then a stop. This is open-loop — no +SLAM, no obstacle avoidance — but works in sim and on the bare-metal +robot without requiring the nav stack. Good fit for a VLM that decides +the next short step from a camera frame and re-plans every iteration. +""" + +import asyncio +import math +import queue +import threading +import time +from typing import Any + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +import uvicorn + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class CmdBridgeConfig(ModuleConfig): + port: int = 7782 + + +class TwistRequest(BaseModel): + linear: list[float] = Field(default=[0.0, 0.0, 0.0], min_length=3, max_length=3) + angular: list[float] = Field(default=[0.0, 0.0, 0.0], min_length=3, max_length=3) + duration: float = 0.5 # seconds to hold the command before stopping + + +class PathStep(BaseModel): + """One step of an open-loop path. + + `linear`/`angular` are the same Twist components as `/cmd_vel`. The step + runs for `duration` seconds, then the next step starts. If the model + prefers semantic steps, use `forward`/`left`/`degrees` and let the bridge + convert. + """ + + linear: list[float] | None = None + angular: list[float] | None = None + # Semantic alternative: distances in meters, rotation in degrees, + # executed over `duration` seconds. Converted to Twist by dividing. + forward: float | None = None + left: float | None = None + degrees: float | None = None + duration: float = 1.0 + + +class PathRequest(BaseModel): + steps: list[PathStep] + + +class CmdBridgeModule(Module): + """Open-loop HTTP -> cmd_vel bridge.""" + + config: CmdBridgeConfig + cmd_vel: Out[Twist] + odom: In[PoseStamped] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._app = FastAPI() + self._app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["GET", "POST"], + allow_headers=["*"], + ) + self._latest_pose: PoseStamped | None = None + self._server: uvicorn.Server | None = None + self._thread: threading.Thread | None = None + # Serialize POSTs so /path doesn't interleave with /cmd_vel. + self._drive_lock = threading.Lock() + # Cooperative cancel for an in-flight /path when /stop arrives. + self._cancel_event = threading.Event() + self._setup_routes() + + def _setup_routes(self) -> None: + @self._app.get("/pose") + def pose() -> dict[str, Any]: + if self._latest_pose is None: + return {"status": "no pose yet"} + p = self._latest_pose + q = p.orientation + # Yaw from quaternion (assumes near-zero roll/pitch, fine for a quadruped). + siny = 2.0 * (q.w * q.z + q.x * q.y) + cosy = 1.0 - 2.0 * (q.y * q.y + q.z * q.z) + theta = math.atan2(siny, cosy) + return { + "status": "ok", + "x": p.position.x, + "y": p.position.y, + "z": p.position.z, + "theta": theta, + "ts": p.ts, + } + + @self._app.post("/cmd_vel") + def cmd_vel(req: TwistRequest) -> dict[str, Any]: + with self._drive_lock: + self._cancel_event.clear() + self._publish_twist(req.linear, req.angular) + self._sleep_or_cancel(req.duration) + self._publish_twist([0, 0, 0], [0, 0, 0]) + return {"status": "ok"} + + @self._app.post("/path") + def path(req: PathRequest) -> dict[str, Any]: + with self._drive_lock: + self._cancel_event.clear() + executed = 0 + for step in req.steps: + if self._cancel_event.is_set(): + break + linear, angular = _step_to_twist(step) + self._publish_twist(linear, angular) + if self._sleep_or_cancel(step.duration): + break + executed += 1 + self._publish_twist([0, 0, 0], [0, 0, 0]) + return { + "status": "cancelled" if self._cancel_event.is_set() else "ok", + "executed": executed, + "total": len(req.steps), + } + + @self._app.post("/stop") + def stop() -> dict[str, str]: + self._cancel_event.set() + self._publish_twist([0, 0, 0], [0, 0, 0]) + return {"status": "stopped"} + + def _publish_twist(self, linear: list[float], angular: list[float]) -> None: + self.cmd_vel.publish( + Twist( + linear=Vector3(linear[0], linear[1], linear[2]), + angular=Vector3(angular[0], angular[1], angular[2]), + ts=time.time(), + ) + ) + + def _sleep_or_cancel(self, duration: float) -> bool: + """Sleep up to `duration`, returns True if cancelled mid-sleep.""" + return self._cancel_event.wait(timeout=max(0.0, duration)) + + def _on_odom(self, pose: PoseStamped) -> None: + self._latest_pose = pose + + @rpc + def start(self) -> None: + super().start() + self.register_disposable(self.odom.subscribe(self._on_odom)) + + def run() -> None: + config = uvicorn.Config( + self._app, + host="127.0.0.1", + port=self.config.port, + log_level="warning", + lifespan="on", + ) + self._server = uvicorn.Server(config) + try: + self._server.run() + except OSError as e: + logger.error( + f"cmd-bridge failed to bind :{self.config.port} ({e}); " + f"another instance? `lsof -ti :{self.config.port} | xargs kill -9`" + ) + except Exception: + logger.exception("cmd-bridge server crashed") + + self._thread = threading.Thread(target=run, daemon=True, name="cmd-bridge-uvicorn") + self._thread.start() + logger.info( + f"cmd-bridge-module: POST http://127.0.0.1:{self.config.port}/cmd_vel " + f"| POST /path | POST /stop | GET /pose" + ) + + @rpc + def stop(self) -> None: + self._cancel_event.set() + self._publish_twist([0, 0, 0], [0, 0, 0]) + if self._server is not None: + self._server.should_exit = True + super().stop() + + +def _step_to_twist(step: PathStep) -> tuple[list[float], list[float]]: + """Convert a PathStep to (linear, angular) Twist components.""" + if step.linear is not None or step.angular is not None: + return ( + step.linear or [0.0, 0.0, 0.0], + step.angular or [0.0, 0.0, 0.0], + ) + dur = max(step.duration, 1e-3) + fwd = (step.forward or 0.0) / dur + lat = (step.left or 0.0) / dur + yaw = math.radians(step.degrees or 0.0) / dur + return [fwd, lat, 0.0], [0.0, 0.0, yaw] diff --git a/journal/2026-05-27-mlxvlm-robot-integration-prompt.md b/journal/2026-05-27-mlxvlm-robot-integration-prompt.md new file mode 100644 index 0000000000..e5b0c343c1 --- /dev/null +++ b/journal/2026-05-27-mlxvlm-robot-integration-prompt.md @@ -0,0 +1,123 @@ +# mlxvlm → dimos: drive the Go2 from a VLM + +Use this as the prompt for the Claude working on `/Users/tex/repos/ai/mlx/mlxvlm`. + +## Context + +dimos is the robot OS. It runs as a separate process on the same Mac and +exposes three OpenAI/HTTP-shaped bridges on localhost so mlxvlm can plug in +without speaking any internal dimos APIs: + +| URL | Direction | Format | +|---|---|---| +| `GET http://127.0.0.1:7780/video_feed/color_image` | dimos → you | MJPEG (`multipart/x-mixed-replace; boundary=frame`) | +| `GET http://127.0.0.1:7780/snapshot/color_image` | dimos → you | one JPEG | +| `GET ws://127.0.0.1:7781/audio_out` | dimos → you | binary int16 PCM frames; first WS message is `{"event":"format","sample_rate":N,"channels":N}` | +| `GET http://127.0.0.1:7781/audio_info` | dimos → you | `{sample_rate,channels}` JSON | +| `POST http://127.0.0.1:7781/play` | you → dimos | body = raw WAV bytes; plays on robot speaker (no-op in sim) | +| `POST http://127.0.0.1:7782/cmd_vel` | you → dimos | one Twist step | +| `POST http://127.0.0.1:7782/path` | you → dimos | sequence of Twist steps, executed in order | +| `POST http://127.0.0.1:7782/stop` | you → dimos | zero-Twist emergency stop | +| `GET http://127.0.0.1:7782/pose` | dimos → you | `{x, y, z, theta, ts}` of base_link in world frame | + +All endpoints are CORS-open. Endpoints stay the same whether dimos is in +MuJoCo sim or driving a real Go2 — only audio is robot-only. + +## Twist + Path semantics + +`cmd_vel` is a standard ROS-style Twist: linear `[x_forward, y_left, z]` in +m/s, angular `[x_roll, y_pitch, z_yaw]` in rad/s. Open-loop: the bridge +publishes the Twist, sleeps for `duration` seconds, then publishes zero. + +`/path` takes a list of steps. Two ways to spell a step: + +1. **Raw Twist** — the model emits velocity components: + ```json + {"linear":[0.3,0,0],"angular":[0,0,0],"duration":1.0} + ``` + +2. **Semantic** — the model emits distances in m / rotation in deg, and the + bridge divides by `duration` to make a Twist: + ```json + {"forward":0.5,"left":0,"degrees":0,"duration":2.0} + {"forward":0,"left":0,"degrees":-90,"duration":1.5} + ``` + +Steps run sequentially. `/stop` cancels an in-flight `/path` cooperatively. + +No SLAM, no obstacle avoidance — for closed-loop, **re-plan** every iteration +from a fresh camera frame instead of sending long paths. + +## What to implement in mlxvlm + +Add a `POST /api/robot/navigate` route to `app.py` that runs a perceive-act +loop until the goal is reached, the user cancels, or `max_steps` is hit: + +``` +input: { "goal": "find the red chair and stop in front of it", + "max_steps": 12, + "step_seconds": 1.0 } + +loop: + 1. frame = httpx.get(DIMOS_CAMERA_URL).content + 2. pose = httpx.get(DIMOS_POSE_URL).json() + 3. ask Gemma-4-Falcon-Perception via the existing /api/analyze pipeline: + "Camera frame attached. Robot pose: {pose}. Goal: {goal}. + Decide the next ≤2 second movement. Reply ONLY with strict JSON: + { 'action':'move'|'stop'|'arrived', + 'forward': float meters, + 'left': float meters, + 'degrees': float rotation, + 'reason': '' } + Positive forward=ahead, positive left=left, positive degrees=CCW. + Keep |forward|≤1.0, |left|≤0.5, |degrees|≤45 per step." + 4. parse JSON; if action=='arrived' or 'stop', POST /stop and break + 5. POST /api/robot/navigate/steps a single PathStep with the chosen + values + duration=step_seconds, wait for response + 6. SSE-stream {step, action, reason, pose} so the UI shows progress + +env config (read at startup): + DIMOS_CAMERA_URL = http://127.0.0.1:7780/snapshot/color_image + DIMOS_POSE_URL = http://127.0.0.1:7782/pose + DIMOS_PATH_URL = http://127.0.0.1:7782/path + DIMOS_STOP_URL = http://127.0.0.1:7782/stop + DIMOS_PLAY_URL = http://127.0.0.1:7781/play (optional, for spoken feedback) +``` + +Failure modes the loop must handle: +- camera/pose endpoint 5xx or timeout (skip the step, log, retry once) +- JSON parse failure from the model (treat as `stop`) +- per-step Twist limits exceeded by the model (clamp, log) +- max_steps hit without `arrived` (return `{status: "exhausted", history: […]}`) + +UI (optional, in `static/`): +- "Goal" text input + "Go" button → POST /api/robot/navigate +- live event stream → render the latest camera frame, current pose, last + reason, step counter +- big red "Stop" button → POST `DIMOS_STOP_URL` directly (don't wait for + the agent) + +## Audio loop (optional follow-up) +If you want the robot to speak / listen: +- Mic: open `ws://127.0.0.1:7781/audio_out`, parse the first JSON + `format` frame, buffer ~1s of PCM, wrap in a WAV header, POST to the + existing mlx-audio whisper server on `:8000`. +- Speaker: TTS output → POST raw WAV to `http://127.0.0.1:7781/play`. + +## Things NOT to do +- Don't try to import anything from dimos. All integration is over HTTP/WS. +- Don't long-poll `/path` — it blocks until the steps finish. Use shorter + paths (1–3 steps) per loop iteration so the VLM can re-evaluate. +- Don't assume metric scale from the camera — the VLM reasons about + "ahead", "left", "approaching", not centimeters. Conservative step sizes + (0.3–0.6 m forward, ≤30° turns) are safer. + +## Testing without a real robot +The dimos sim publishes the same endpoints. Start it with: +``` +cd ~/repos/robotics/dimos +./sim-with-llm.sh mlxvlm # routes the in-sim agent's LLM to your :8080 +``` +Then in another terminal, your mlxvlm `POST /api/robot/navigate` should +drive the sim Go2 in the MuJoCo viewer, and you'll see its 3D position +change via `GET /pose`. From 5155188835838ed60965d99b16c50f8a357cde11 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 20:26:31 +0800 Subject: [PATCH 28/29] chore(go2-start): include cmd-bridge-module in default bridges BRIDGES=1 now appends camera-mjpeg-module + audio-ws-module + cmd-bridge-module, and the banner lists the new cmd_vel/path/stop/pose endpoints alongside the camera and audio URLs. Co-Authored-By: Claude Opus 4.7 (1M context) --- go2-start.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/go2-start.sh b/go2-start.sh index 538992016f..98af4d9337 100755 --- a/go2-start.sh +++ b/go2-start.sh @@ -54,7 +54,7 @@ fi # Assemble the module list once so it's reused for echo + exec. MODULES=("$BLUEPRINT") if [[ "$BRIDGES" == "1" ]]; then - MODULES+=("camera-mjpeg-module" "audio-ws-module") + MODULES+=("camera-mjpeg-module" "audio-ws-module" "cmd-bridge-module") fi if [[ -n "$EXTRA" ]]; then # shellcheck disable=SC2206 @@ -208,6 +208,10 @@ if [[ "$BRIDGES" == "1" ]]; then echo " audio info : http://127.0.0.1:7781/audio_info" echo " audio play : POST http://127.0.0.1:7781/play" fi + echo " cmd_vel : POST http://127.0.0.1:7782/cmd_vel" + echo " path : POST http://127.0.0.1:7782/path" + echo " stop : POST http://127.0.0.1:7782/stop" + echo " pose : http://127.0.0.1:7782/pose" fi echo c_bold "▶ launching: dimos $([[ "$SIMULATION" == "1" ]] && echo "--simulation ")run ${MODULES[*]} ${LLM_ARGS[*]:-}" From 76575efc9446e917b68ffdb9a3511709f76e67f2 Mon Sep 17 00:00:00 2001 From: tfius Date: Wed, 27 May 2026 20:37:12 +0800 Subject: [PATCH 29/29] fix(go2-start): auto-inject mcp-server/mcp-client for real-robot LLM mode Real-robot LLM launches (LMSTUDIO=1 ./go2-start.sh) used to forget the McpClient, so the -o mcpclient.model=... override silently bound to nothing and LM Studio sat unused. go2-start.sh now mirrors what sim-with-llm.sh was doing: when an LLM preset is set against a non-agentic blueprint, mcp-server + mcp-client are appended to EXTRA. The check is skipped when the blueprint already includes an agent (e.g. unitree-go2-agentic-ollama). sim-with-llm.sh becomes a thin wrapper: it only picks the backend preset and forwards to go2-start.sh with SIMULATION=1. Co-Authored-By: Claude Opus 4.7 (1M context) --- go2-start.sh | 36 +++++++++++++++++++++++------------- sim-with-llm.sh | 34 +++++++++------------------------- 2 files changed, 32 insertions(+), 38 deletions(-) diff --git a/go2-start.sh b/go2-start.sh index 98af4d9337..180cd6a77c 100755 --- a/go2-start.sh +++ b/go2-start.sh @@ -51,16 +51,6 @@ if [[ "$LMSTUDIO" == "1" && "$MLXVLM" == "1" ]]; then exit 1 fi -# Assemble the module list once so it's reused for echo + exec. -MODULES=("$BLUEPRINT") -if [[ "$BRIDGES" == "1" ]]; then - MODULES+=("camera-mjpeg-module" "audio-ws-module" "cmd-bridge-module") -fi -if [[ -n "$EXTRA" ]]; then - # shellcheck disable=SC2206 - MODULES+=( $EXTRA ) -fi - # Optional LLM endpoint override + model selection for the McpClient. LLM_ARGS=() LLM_NAME="" @@ -75,13 +65,33 @@ elif [[ "$MLXVLM" == "1" ]]; then LLM_URL="http://127.0.0.1:8080/v1" LLM_MODEL="$MLXVLM_MODEL" fi + +# When an LLM preset is set but the chosen blueprint has no McpClient, +# auto-inject mcp-server + mcp-client so the override actually binds. +# (Mirrors what sim-with-llm.sh does, so real-robot LLM mode behaves the +# same as sim.) +if [[ -n "$LLM_URL" && "$BLUEPRINT" != *"agentic"* ]]; then + if [[ -z "$EXTRA" ]]; then + EXTRA="mcp-server mcp-client" + elif [[ "$EXTRA" != *"mcp-client"* ]]; then + EXTRA="$EXTRA mcp-server mcp-client" + fi +fi + +# Assemble the module list once so it's reused for echo + exec. +MODULES=("$BLUEPRINT") +if [[ "$BRIDGES" == "1" ]]; then + MODULES+=("camera-mjpeg-module" "audio-ws-module" "cmd-bridge-module") +fi +if [[ -n "$EXTRA" ]]; then + # shellcheck disable=SC2206 + MODULES+=( $EXTRA ) +fi + if [[ -n "$LLM_URL" ]]; then export OPENAI_BASE_URL="$LLM_URL" export OPENAI_API_KEY="${OPENAI_API_KEY:-local-llm}" # placeholder; servers ignore it LLM_ARGS=( -o "mcpclient.model=openai:$LLM_MODEL" ) - if [[ "$BLUEPRINT" != *"agentic"* ]]; then - c_yellow "⚠ LLM preset is set but blueprint '$BLUEPRINT' has no McpClient — override is a no-op" - fi fi c_bold "▶ Go2 hackathon quickstart" diff --git a/sim-with-llm.sh b/sim-with-llm.sh index 991ddf1752..0f30e3394e 100755 --- a/sim-with-llm.sh +++ b/sim-with-llm.sh @@ -1,41 +1,26 @@ #!/usr/bin/env bash -# sim-with-llm.sh — sim + Mac-friendly agentic + local LLM, one command. +# sim-with-llm.sh — sim + local LLM, one command. # -# Defaults to `unitree-go2-agentic-gemini` because it's the only agentic -# variant that boots on Apple Silicon: it disables SecurityModule (EdgeTAM -# needs CUDA), swaps Moondream (crashes Metal) → Gemini for VL, and replaces -# the OpenAI-hardcoded TTS with Gemini TTS. We only override its *chat* LLM -# (the McpClient) to point at LM Studio or mlxvlm. The VL/embedding/TTS -# pieces still need GOOGLE_API_KEY. +# Thin wrapper around go2-start.sh: sets SIMULATION=1 and the backend preset. +# go2-start.sh handles the rest (auto-injects mcp-server/mcp-client when an +# LLM preset is set against a non-agentic blueprint). # # Usage: # ./sim-with-llm.sh # mlxvlm Gemma-4 (default chat LLM) # ./sim-with-llm.sh lmstudio # LM Studio -# ./sim-with-llm.sh mlxvlm qwen3 # mlxvlm with a different loaded model +# ./sim-with-llm.sh mlxvlm qwen3 # mlxvlm with a specific loaded model # ./sim-with-llm.sh ollama # local ollama (uses native ollama blueprint) # -# Override the base blueprint if you need a different shape: -# BLUEPRINT=unitree-go2-basic ./sim-with-llm.sh lmstudio # no agent, just streams -# -# Required env (for VL/embedding/TTS on Mac): -# GOOGLE_API_KEY=… # https://aistudio.google.com/app/apikey +# Override the base blueprint if you want a different shape: +# BLUEPRINT=unitree-go2-basic ./sim-with-llm.sh lmstudio # default +# BLUEPRINT=unitree-go2-agentic-ollama ./sim-with-llm.sh # full agentic compose set -euo pipefail BACKEND="${1:-mlxvlm}" shift || true -# Minimal Mac+local-LLM compose that imports cleanly without google-genai: -# unitree-go2-basic (camera + 3D viz) -# + mcp-server + mcp-client (the MCP agent loop) -# Skills exposed (from GO2Connection): observe, play_wav, play_wav_b64. -# -# To add movement / sport / tilt skills, also pass: -# EXTRA="mcp-server mcp-client unitree-skill-container replanning-a-star-planner" -# (the planner satisfies UnitreeSkillContainer's NavigationInterfaceSpec dep, -# and pulls in the nav-stack chain — heavier but enables relative_move). BLUEPRINT="${BLUEPRINT:-unitree-go2-basic}" -export EXTRA="${EXTRA:-mcp-server mcp-client}" case "$BACKEND" in mlxvlm) @@ -47,9 +32,8 @@ case "$BACKEND" in if [[ -n "${1:-}" ]]; then export LMSTUDIO_MODEL="$1"; fi ;; ollama) - # The ollama agentic blueprint is already a clean Mac compose. + # The ollama agentic blueprint already composes mcp-server/mcp-client. BLUEPRINT="unitree-go2-agentic-ollama" - export EXTRA="" ;; *) echo "unknown backend: $BACKEND (use mlxvlm | lmstudio | ollama)" >&2