diff --git a/.gitignore b/.gitignore index aedee04af7..a3c0a05a4a 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,8 @@ htmlcov/ # Memory2 autorecord recording*.db +recording*.db-wal +recording*.db-shm + +# MuJoCo runtime log +MUJOCO_LOG.TXT diff --git a/dimos/agents/skills/gemini_speak_skill.py b/dimos/agents/skills/gemini_speak_skill.py new file mode 100644 index 0000000000..aecdad9cbe --- /dev/null +++ b/dimos/agents/skills/gemini_speak_skill.py @@ -0,0 +1,145 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A speak skill backed by Google's Gemini TTS API. + +Drop-in replacement for ``SpeakSkill`` that reuses ``GOOGLE_API_KEY`` (already +used for the LLM and embeddings) instead of an OpenAI key, and works on any +platform with an audio output device. Satisfies ``SpeakSkillSpec`` +(``speak(text, blocking=True) -> str``). +""" + +import threading +import time + +from reactivex import Subject + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.stream.audio.node_output import SounddeviceAudioOutput +from dimos.stream.audio.tts.node_gemini import GeminiTTSNode +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class GeminiSpeakSkillConfig(ModuleConfig): + # Prebuilt Gemini voice name (e.g. "Kore", "Puck", "Charon", "Aoede"). 
+ voice: str = "Kore" + # Gemini TTS model; must be a `*-preview-tts` model to emit audio. + model: str = "gemini-2.5-flash-preview-tts" + + +class GeminiSpeakSkill(Module): + config: GeminiSpeakSkillConfig + _tts_node: GeminiTTSNode | None = None + _audio_output: SounddeviceAudioOutput | None = None + _audio_lock: threading.Lock = threading.Lock() + _text_subject: "Subject[str] | None" = None + + @rpc + def start(self) -> None: + super().start() + self._tts_node = GeminiTTSNode(voice=self.config.voice, model=self.config.model) + self._audio_output = SounddeviceAudioOutput(sample_rate=24000) + self._audio_output.consume_audio(self._tts_node.emit_audio()) + # Wire the text pipeline ONCE. Each speak() just pushes onto this subject; + # the TTS node's own worker drains it FIFO. (Previously consume_text was + # called per speak(), spawning a fresh worker thread + subscription every + # call — a leak, and the source of the repeated "Starting GeminiTTSNode".) + self._text_subject = Subject() + self._tts_node.consume_text(self._text_subject) + + @rpc + def stop(self) -> None: + if self._tts_node: + # dispose() clears the queue and joins the worker, so in-flight/queued + # speech is torn down here — no separate bg-thread bookkeeping needed. + self._tts_node.dispose() + self._tts_node = None + if self._audio_output: + self._audio_output.stop() + self._audio_output = None + self._text_subject = None + super().stop() + + @skill + def speak(self, text: str, blocking: bool = False) -> str: + """Speak text out loud through the robot's speakers. + + USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. + + Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. + + Returns immediately by default (the audio plays in the background); pass + ``blocking=True`` only when you must wait until the utterance finishes. 
+ + Example usage: + + speak("Hello, I am your robot assistant.") + """ + if self._tts_node is None or self._text_subject is None: + return "Error: TTS not initialized" + + if not blocking: + # Fire-and-forget: enqueue on the shared pipeline and return now. + self._text_subject.on_next(text) + return f"Speaking (non-blocking): {text}" + + return self._speak_blocking(text) + + def _speak_blocking(self, text: str) -> str: + # Serialize blocking speech so utterances don't overlap on the speaker. + with self._audio_lock: + if self._tts_node is None or self._text_subject is None: + return "Error: TTS not initialized" + + audio_complete = threading.Event() + + # emit_text() re-emits the exact utterance once its synthesis finishes; + # match on the text so a concurrent non-blocking speak can't trip us. + def on_text(t: str) -> None: + if t == text: + audio_complete.set() + + def on_error(_e: Exception) -> None: + audio_complete.set() + + subscription = self._tts_node.emit_text().subscribe( + on_next=on_text, + on_error=on_error, + ) + + self._text_subject.on_next(text) + + # Gemini synthesis is a network round-trip; allow more headroom than + # the local-output time so first-token latency doesn't trip the wait. + timeout = max(15, len(text) * 0.1) + try: + if not audio_complete.wait(timeout=timeout): + logger.warning(f"TTS timeout reached for: {text}") + return f"Warning: TTS timeout while speaking: {text}" + # Small delay to ensure buffers flush. 
+ time.sleep(0.3) + return f"Spoke: {text}" + finally: + subscription.dispose() + + +if __name__ == "__main__": + skill_module = GeminiSpeakSkill() + skill_module.start() + print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.", blocking=True)) + skill_module.stop() diff --git a/dimos/agents/skills/local_speak_skill.py b/dimos/agents/skills/local_speak_skill.py new file mode 100644 index 0000000000..1af0e8666a --- /dev/null +++ b/dimos/agents/skills/local_speak_skill.py @@ -0,0 +1,122 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A speak skill backed by the macOS ``say`` command. + +Drop-in replacement for ``SpeakSkill`` that requires no OpenAI key and no +audio pipeline: ``say`` synthesizes and plays the audio itself. macOS only. +Satisfies ``SpeakSkillSpec`` (``speak(text, blocking=True) -> str``). +""" + +import shutil +import subprocess +import threading + +from dimos.agents.annotation import skill +from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class LocalSpeakSkillConfig(ModuleConfig): + # macOS `say` voice name (e.g. "Daniel", "Samantha"); None = system default. + voice: str | None = None + # Speech rate in words per minute; None = `say` default (~175). 
+ rate: int | None = None + + +class LocalSpeakSkill(Module): + """Speak text out loud through the local macOS ``say`` command.""" + + config: LocalSpeakSkillConfig + _bg_threads: list[threading.Thread] = [] + _bg_threads_lock: threading.Lock = threading.Lock() + + @rpc + def start(self) -> None: + super().start() + if shutil.which("say") is None: + logger.warning( + "LocalSpeakSkill: `say` not found on PATH; speak() will no-op. " + "This skill is macOS-only." + ) + + @rpc + def stop(self) -> None: + with self._bg_threads_lock: + threads = list(self._bg_threads) + for t in threads: + t.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + super().stop() + + def _command(self, text: str) -> list[str]: + cmd = ["say"] + if self.config.voice: + cmd += ["-v", self.config.voice] + if self.config.rate: + cmd += ["-r", str(self.config.rate)] + cmd.append(text) + return cmd + + @skill + def speak(self, text: str, blocking: bool = True) -> str: + """Speak text out loud through the robot's speakers. + + USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. + + Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. 
+ + Example usage: + + speak("Hello, I am your robot assistant.") + """ + if shutil.which("say") is None: + return "Error: `say` command not available (LocalSpeakSkill is macOS-only)" + + if not text.strip(): + return "Error: nothing to speak" + + if not blocking: + thread = threading.Thread( + target=self._speak_blocking, args=(text,), daemon=True, name="LocalSpeakSkill-bg" + ) + with self._bg_threads_lock: + self._bg_threads.append(thread) + thread.start() + return f"Speaking (non-blocking): {text}" + + return self._speak_blocking(text) + + def _speak_blocking(self, text: str) -> str: + try: + subprocess.run(self._command(text), check=True) + except (subprocess.CalledProcessError, OSError) as e: # OSError: `say` failed to launch + logger.error(f"`say` failed: {e}") + return f"Error: failed to speak: {text}" + finally: + with self._bg_threads_lock: + self._bg_threads = [ + t for t in self._bg_threads if t is not threading.current_thread() + ] + return f"Spoke: {text}" + + +if __name__ == "__main__": + skill_module = LocalSpeakSkill() + skill_module.start() + print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.")) + skill_module.stop() diff --git a/dimos/agents/skills/map_uploader.py b/dimos/agents/skills/map_uploader.py new file mode 100644 index 0000000000..9cbcde99f1 --- /dev/null +++ b/dimos/agents/skills/map_uploader.py @@ -0,0 +1,91 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +"""Periodically upload the 2D occupancy map (global_costmap) to the robomoo app. + +Subscribes to `global_costmap`, encodes it as a *value-preserving* grayscale PNG +(free=0, occupied=1..100, unknown=255 — NOT a pre-colored image), throttles, and +POSTs it plus grid metadata (resolution, origin, width, height) to robomoo's +`/api/robot/map`. The web app reads the raw cell values back and applies its own +colormap + overlays, mapping world→pixel via +`col = (x - originX) / resolution`, `row = (y - originY) / resolution`. + +Env: ROBOMOO_URL, ROBOT_INGEST_TOKEN. +""" + +import os +import time + +import cv2 +import numpy as np +import httpx + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class MapUploaderConfig(ModuleConfig): + robomoo_url: str = os.getenv("ROBOMOO_URL", "") + ingest_token: str = os.getenv("ROBOT_INGEST_TOKEN", "") + min_period_s: float = 5.0 # throttle: at most one upload every N seconds + + +class MapUploader(Module): + config: MapUploaderConfig + global_costmap: In[OccupancyGrid] + + @rpc + def start(self) -> None: + super().start() + self._last = 0.0 + self.global_costmap.subscribe(self._on_costmap) + + def _on_costmap(self, grid: OccupancyGrid) -> None: + now = time.monotonic() + if now - self._last < self.config.min_period_s: + return + url = self.config.robomoo_url + token = self.config.ingest_token + if not url or not token: + return + self._last = now + + try: + # Value-preserving encoding: free/occupied 0..100 stay as-is, unknown + # (-1) → 255. The web recolors from these raw values, so we never bake + # a colormap into the upload. (H, W) uint8 → grayscale PNG. 
+ enc = np.where(grid.grid == -1, 255, np.clip(grid.grid, 0, 100)).astype(np.uint8) + ok, buf = cv2.imencode(".png", enc) + if not ok: + return + httpx.post( + f"{url.rstrip('/')}/api/robot/map", + headers={"Authorization": f"Bearer {token}"}, + files={"file": ("map.png", buf.tobytes(), "image/png")}, + data={ + "resolution": str(grid.resolution), + "originX": str(grid.origin.position.x), + "originY": str(grid.origin.position.y), + "width": str(grid.width), + "height": str(grid.height), + }, + timeout=30.0, + ).raise_for_status() + except Exception as e: # noqa: BLE001 — best-effort; never break the stream + logger.warning("map upload failed: %s", e) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index d88bec452e..111ba27700 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -19,9 +19,11 @@ from dimos.agents.annotation import skill from dimos.core.core import rpc -from dimos.core.module import Module +from dimos.core.module import Module, ModuleConfig from dimos.core.stream import In from dimos.models.qwen.bbox import BBox +from dimos.models.vl.create import create +from dimos.models.vl.types import VlModelName from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped from dimos.msgs.geometry_msgs.Quaternion import Quaternion from dimos.msgs.geometry_msgs.Vector3 import Vector3, make_vector3 @@ -37,7 +39,14 @@ logger = setup_logger() +class Config(ModuleConfig): + # VL model used to detect objects in view for semantic navigation. + vl_model_name: VlModelName = "qwen" + + class NavigationSkillContainer(Module): + config: Config + _latest_image: Image | None = None _latest_odom: PoseStamped | None = None _skill_started: bool = False @@ -54,10 +63,7 @@ def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self._skill_started = False - # Here to prevent unwanted imports in the file. 
- from dimos.models.vl.qwen import QwenVlModel - - self._vl_model = QwenVlModel() + self._vl_model = create(self.config.vl_model_name) @rpc def start(self) -> None: diff --git a/dimos/agents/skills/person_follow.py b/dimos/agents/skills/person_follow.py index 4175898e45..35ec768fa2 100644 --- a/dimos/agents/skills/person_follow.py +++ b/dimos/agents/skills/person_follow.py @@ -15,7 +15,7 @@ import base64 from threading import Event, RLock, Thread import time -from typing import Any +from typing import Any, Literal import numpy as np from reactivex.disposable import Disposable @@ -30,6 +30,7 @@ from dimos.models.segmentation.edge_tam import EdgeTAMProcessor from dimos.models.vl.base import VlModel from dimos.models.vl.create import create +from dimos.models.vl.types import VlModelName from dimos.msgs.geometry_msgs.Twist import Twist from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.Image import Image, ImageFormat @@ -46,6 +47,12 @@ class Config(ModuleConfig): camera_info: CameraInfo use_3d_navigation: bool = False + # VL model used for the initial (and, in "redetect" mode, continuous) detection. + vl_model_name: VlModelName = "qwen" + # "edgetam": GPU frame-to-frame tracking (default where CUDA exists). + # "redetect": periodic re-detection with the VL model (no GPU, runs on Mac). + # "auto": pick "edgetam" if CUDA is available, else "redetect". 
+ tracking_mode: Literal["auto", "edgetam", "redetect"] = "auto" class PersonFollowSkillContainer(Module): @@ -66,13 +73,15 @@ class PersonFollowSkillContainer(Module): _frequency: float = 20.0 # Hz - control loop frequency _max_lost_frames: int = 15 # number of frames to wait before declaring person lost + _lost_timeout: float = 5.0 # seconds without a re-detection before declaring person lost + _redetect_period: float = 0.8 # seconds between VL re-anchors in "redetect" mode _patrolling_module_spec: PatrollingModuleSpec def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) self._latest_image: Image | None = None self._latest_pointcloud: PointCloud2 | None = None - self._vl_model: VlModel = create("qwen") + self._vl_model: VlModel = create(self.config.vl_model_name) self._tracker: EdgeTAMProcessor | None = None self._thread: Thread | None = None self._should_stop: Event = Event() @@ -206,9 +215,24 @@ def _on_pointcloud(self, pointcloud: PointCloud2) -> None: with self._lock: self._latest_pointcloud = pointcloud + def _resolve_tracking_mode(self) -> Literal["edgetam", "redetect"]: + """Resolve the effective tracking mode, auto-selecting by hardware.""" + mode = self.config.tracking_mode + if mode != "auto": + return mode + try: + import torch + + return "edgetam" if torch.cuda.is_available() else "redetect" + except Exception: + return "redetect" + def _follow_person( self, query: str, initial_bbox: BBox, detection_image: Image | None = None ) -> str: + if self._resolve_tracking_mode() == "redetect": + return self._follow_person_redetect(query) + x1, y1, x2, y2 = initial_bbox box = np.array([x1, y1, x2, y2], dtype=np.float32) @@ -310,6 +334,145 @@ def _follow_loop(self, tracker: "EdgeTAMProcessor", query: str) -> None: self._send_stop_reason(query, "it was requested to stop following") + def _follow_person_redetect(self, query: str) -> str: + """Start following using periodic VL re-detection (no GPU tracker). 
+ + Used when no CUDA GPU is available (e.g. on a Mac). The initial detection + has already succeeded by the time this is called; the loop keeps + re-detecting the person with the VL model and servoing toward the bbox. + """ + self.start_tool("follow_person") + self._thread = Thread(target=self._follow_loop_redetect, args=(query,), daemon=True) + self._thread.start() + + message = ( + "Found the person. Starting to follow. You can stop following by calling " + "the 'stop_following' tool. You will receive streaming updates." + ) + + if self._patrolling_module_spec.is_patrolling(): + message += ( + " Note: since the robot was patrolling, this has been stopped automatically " + "(the equivalent of calling the `stop_patrol` tool call) so you don't have " + "to do it. " + ) + self._patrolling_module_spec.stop_patrol() + + return message + + @staticmethod + def _create_tracker() -> Any: + """Best available OpenCV single-object tracker, or None. + + Prefers CSRT > KCF > MIL and checks both the main and legacy namespaces, + so it auto-upgrades to CSRT where opencv-contrib is installed and falls + back to MIL (present in base OpenCV) otherwise. + """ + import cv2 + + for ns in (cv2, getattr(cv2, "legacy", None)): + if ns is None: + continue + for name in ("TrackerCSRT_create", "TrackerKCF_create", "TrackerMIL_create"): + ctor = getattr(ns, name, None) + if ctor is not None: + return ctor() + return None + + def _follow_loop_redetect(self, query: str) -> None: + """Follow without a GPU tracker (e.g. on a Mac). + + A single VL-model detection can take ~1-2s, so running it inline would cap the + control loop at ~0.5-1 Hz and the robot would act on stale commands. Instead + a background thread re-detects every ``_redetect_period`` seconds to *anchor* + a cheap local OpenCV tracker, while this control loop runs at ``_frequency`` + Hz: each cycle it updates the tracker locally and publishes a fresh twist.
+ """ + det_lock = RLock() + # Latest VL detection handed from the detect thread to the control loop. + det: dict[str, Any] = {"bbox": None, "image": None, "seq": 0} + + def _detect_worker() -> None: + while not self._should_stop.is_set(): + with self._lock: + img = self._latest_image + if img is None: + self._should_stop.wait(0.05) + continue + bbox = get_object_bbox_from_image(self._vl_model, img, query) + if bbox is not None: + with det_lock: + det["bbox"] = bbox + det["image"] = img + det["seq"] += 1 + # The call itself already takes ~1-2s; pace re-anchors on top. + self._should_stop.wait(self._redetect_period) + + det_thread = Thread(target=_detect_worker, daemon=True, name="follow-redetect") + det_thread.start() + + period = 1.0 / self._frequency + next_time = time.monotonic() + last_good_time = time.monotonic() + tracker: Any = None + last_seq = -1 + + try: + while not self._should_stop.is_set(): + next_time += period + + with self._lock: + latest_image = self._latest_image + if latest_image is None: + self._sleep_until(next_time) + continue + + with det_lock: + seq, det_bbox, det_image = det["seq"], det["bbox"], det["image"] + + bbox = None + if seq != last_seq and det_bbox is not None: + # Fresh detection: (re)anchor the local tracker on it. + last_seq = seq + tracker = self._create_tracker() + if tracker is not None: + anchor = det_image.data if det_image is not None else latest_image.data + x1, y1, x2, y2 = det_bbox + try: + tracker.init(anchor, (int(x1), int(y1), int(x2 - x1), int(y2 - y1))) + except Exception as e: # noqa: BLE001 — tracker init is best-effort + logger.warning("tracker init failed, detection-only: %s", e) + tracker = None + bbox = det_bbox + last_good_time = time.monotonic() + elif tracker is not None: + # Between detections: cheap local update keeps the loop at rate. 
+ ok, box = tracker.update(latest_image.data) + if ok: + x, y, w, h = box + bbox = (float(x), float(y), float(x + w), float(y + h)) + last_good_time = time.monotonic() + + if bbox is not None: + twist = self._visual_servo.compute_twist(bbox, latest_image.width) + self.cmd_vel.publish(twist) + elif time.monotonic() - last_good_time > self._lost_timeout: + self.cmd_vel.publish(Twist.zero()) + self._send_stop_reason(query, "lost track of the person") + return + # else: no bbox yet but within timeout — hold the last command. + + self._sleep_until(next_time) + finally: + det_thread.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT) + + self._send_stop_reason(query, "it was requested to stop following") + + def _sleep_until(self, next_time: float) -> None: + sleep_duration = next_time - time.monotonic() + if sleep_duration > 0: + time.sleep(sleep_duration) + def _stop_following(self) -> None: self._should_stop.set() diff --git a/dimos/agents/skills/take_picture_skill.py b/dimos/agents/skills/take_picture_skill.py new file mode 100644 index 0000000000..1920254987 --- /dev/null +++ b/dimos/agents/skills/take_picture_skill.py @@ -0,0 +1,312 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Capture robot camera frames and upload them to the robomoo app. + +Skills: + - take_picture: one-shot capture of the current frame. 
+ - explore_and_capture: start autonomous exploration and take a photo every few + seconds as the robot moves (deterministic — drives the cadence itself rather + than relying on the agent to keep calling take_picture). + +Each frame is JPEG-encoded and POSTed (with the robot's odom pose) to robomoo's +`/api/robot/frame` (shared-secret bearer token). Configure via env: + + ROBOMOO_URL=https://gateway-...up.railway.app + ROBOT_INGEST_TOKEN= +""" + +import os +import threading +import time + +import cv2 +import httpx +from dimos_lcm.std_msgs import Bool + +from dimos.agents.annotation import skill +from dimos.agents.skill_result import SkillResult +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.Image import Image +from dimos.navigation.frontier_exploration.frontier_explorer_spec import ( + FrontierExplorerSpec, +) +from dimos.robot.unitree.tilt_spec import TiltSpec +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class TakePictureSkillConfig(ModuleConfig): + robomoo_url: str = os.getenv("ROBOMOO_URL", "") + ingest_token: str = os.getenv("ROBOT_INGEST_TOKEN", "") + + +class TakePictureSkill(Module): + config: TakePictureSkillConfig + color_image: In[Image] + odom: In[PoseStamped] + # Drive frontier exploration (consumed by WavefrontFrontierExplorer). + explore_cmd: Out[Bool] + stop_explore_cmd: Out[Bool] + # Auto-wired (structurally) to WavefrontFrontierExplorer — lets us gate the + # capture loop on whether exploration is still running. + _explorer: FrontierExplorerSpec + # Auto-wired (structurally) to UnitreeSkillContainer.tilt_body — lets + # tilt_and_capture aim the body-fixed camera without owning the connection. 
+ _tilt: TiltSpec + + @rpc + def start(self) -> None: + super().start() + self._latest: Image | None = None + self._pose: PoseStamped | None = None + self._capture_thread: threading.Thread | None = None + self._capture_stop = threading.Event() + # Outstanding fire-and-forget upload threads from take_picture(). + self._uploads: list[threading.Thread] = [] + self._uploads_lock = threading.Lock() + self.color_image.subscribe(self._on_image) + self.odom.subscribe(self._on_odom) + + @rpc + def stop(self) -> None: + self._capture_stop.set() + thread = getattr(self, "_capture_thread", None) + if thread is not None and thread.is_alive(): + thread.join(timeout=5.0) + with self._uploads_lock: + uploads = list(self._uploads) + for t in uploads: + if t.is_alive(): + t.join(timeout=5.0) + super().stop() + + def _on_image(self, image: Image) -> None: + self._latest = image + + def _on_odom(self, pose: PoseStamped) -> None: + self._pose = pose + + def _configured(self) -> bool: + return bool(self.config.robomoo_url and self.config.ingest_token) + + # Encode a given frame and POST it (with pose) to robomoo. Returns the stored + # key, or None if there's no frame / encode failed. Raises on HTTP error. 
+ def _upload_frame( + self, + frame: Image | None, + pose: PoseStamped | None, + note: str = "", + label: str = "", + ) -> str | None: + if frame is None: + return None + ok, buf = cv2.imencode(".jpg", frame.data) + if not ok: + return None + + data: dict[str, str] = {} + if note: + data["note"] = note + if label: + data["label"] = label + if pose is not None: + data["poseX"] = str(pose.position.x) + data["poseY"] = str(pose.position.y) + + resp = httpx.post( + f"{self.config.robomoo_url.rstrip('/')}/api/robot/frame", + headers={"Authorization": f"Bearer {self.config.ingest_token}"}, + files={"file": ("frame.jpg", buf.tobytes(), "image/jpeg")}, + data=data, + timeout=30.0, + ) + resp.raise_for_status() + return resp.json().get("key", "") + + # Thin wrapper used by the explore capture loop: upload the latest frame/pose. + def _upload_current(self, note: str = "", label: str = "") -> str | None: + return self._upload_frame( + getattr(self, "_latest", None), + getattr(self, "_pose", None), + note=note, + label=label, + ) + + @skill + def take_picture(self, note: str = "") -> SkillResult: + """Capture a photo from the robot's camera and upload it. + + Use whenever the user asks the robot to take or capture a single picture + or photo of what it currently sees. `note` is an optional short caption + to tag the image with (e.g. "kitchen", "plant"). Returns immediately; the + encode + upload happen in the background. + """ + frame = getattr(self, "_latest", None) + if frame is None: + return SkillResult.fail("NO_FRAME", "No camera frame received yet") + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + # Snapshot the frame + pose now so the background upload sends exactly what + # the robot saw at call time, not a later frame. 
+ pose = getattr(self, "_pose", None) + + def _bg() -> None: + try: + key = self._upload_frame(frame, pose, note=note, label=note) + logger.info("take_picture uploaded frame key=%s", key) + except Exception: # noqa: BLE001 — fire-and-forget: failures only logged + logger.exception("take_picture upload failed") + + t = threading.Thread(target=_bg, daemon=True, name="take-picture-upload") + with self._uploads_lock: + # Drop finished threads so the list doesn't grow unbounded. + self._uploads = [u for u in self._uploads if u.is_alive()] + self._uploads.append(t) + t.start() + return SkillResult.ok("Picture captured; uploading in the background.") + + @skill + def tilt_and_capture( + self, + pitch_deg: float = -20.0, + note: str = "", + settle_s: float = 1.0, + ) -> SkillResult: + """Tilt the body to aim the camera, photograph that view, then re-level. + + Use to photograph things above or below the robot's straight-ahead view + (the camera is body-fixed). NEGATIVE pitch_deg looks UP, positive looks + DOWN. Runs in the background: tilts, waits `settle_s` for the body to + settle, captures + uploads the tilted view, then returns the body to + level. Returns immediately. The robot should be standing first. + """ + if getattr(self, "_latest", None) is None: + return SkillResult.fail("NO_FRAME", "No camera frame received yet") + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + def _bg() -> None: + try: + self._tilt.tilt_body(pitch_deg=pitch_deg) + # Wait for the body to physically reach the pose and a fresh + # camera frame to arrive before snapshotting. 
+ time.sleep(settle_s) + key = self._upload_frame( + getattr(self, "_latest", None), + getattr(self, "_pose", None), + note=note, + label=note, + ) + logger.info("tilt_and_capture uploaded frame key=%s", key) + except Exception: # noqa: BLE001 — fire-and-forget: failures only logged + logger.exception("tilt_and_capture failed") + finally: + # Always return the body to level, even if capture failed. + try: + self._tilt.tilt_body() + except Exception: # noqa: BLE001 — best effort + logger.exception("tilt_and_capture re-level failed") + + t = threading.Thread(target=_bg, daemon=True, name="tilt-and-capture") + with self._uploads_lock: + self._uploads = [u for u in self._uploads if u.is_alive()] + self._uploads.append(t) + t.start() + return SkillResult.ok( + f"Tilting to pitch={pitch_deg} deg, capturing, then re-leveling (background)." + ) + + @skill + def explore_and_capture( + self, + interval_s: float = 4.0, + max_duration_s: float = 600.0, + note: str = "exploring", + ) -> SkillResult: + """Explore the room and keep taking photos until exploration is complete. + + Use when the user asks the robot to explore / wander / look around AND + take pictures (or capture/photograph) as it goes. Starts autonomous + frontier exploration, then captures and uploads a frame every + `interval_s` seconds for as long as the robot is still exploring — it + stops on its own once the room is fully explored. `max_duration_s` is + only a safety cap. Returns immediately; capturing runs in the background. + """ + if not self._configured(): + return SkillResult.fail( + "NOT_CONFIGURED", "ROBOMOO_URL / ROBOT_INGEST_TOKEN not set" + ) + + # Cancel any in-flight run before starting a new one. 
+ self._capture_stop.set() + if self._capture_thread is not None and self._capture_thread.is_alive(): + self._capture_thread.join(timeout=5.0) + self._capture_stop = threading.Event() + + self.explore_cmd.publish(Bool(data=True)) + self._capture_thread = threading.Thread( + target=self._capture_loop, + args=(interval_s, max_duration_s, note), + daemon=True, + name="explore-and-capture", + ) + self._capture_thread.start() + + return SkillResult.ok( + f"Exploring and capturing a photo every {interval_s:.0f}s until the " + "room is fully explored." + ) + + def _exploring(self) -> bool: + try: + return bool(self._explorer.is_exploration_active()) + except Exception as e: # noqa: BLE001 — if the ref errors, keep capturing + logger.warning("is_exploration_active() failed: %s", e) + return True + + def _capture_loop(self, interval_s: float, max_duration_s: float, note: str) -> None: + start = time.monotonic() + grace_s = 6.0 # let exploration spin up before trusting the active flag + count = 0 + try: + while not self._capture_stop.is_set(): + elapsed = time.monotonic() - start + if elapsed > max_duration_s: + logger.info("explore_and_capture hit max_duration_s cap") + break + if elapsed > grace_s and not self._exploring(): + logger.info("exploration finished — stopping capture loop") + break + try: + if self._upload_current(note=note, label=note) is not None: # "" still counts + count += 1 + except Exception as e: # noqa: BLE001 — keep going on transient errors + logger.warning("explore_and_capture upload failed: %s", e) + self._capture_stop.wait(interval_s) + finally: + try: + self.stop_explore_cmd.publish(Bool(data=True)) + except Exception: # noqa: BLE001 — best effort on shutdown + pass + logger.info("explore_and_capture finished: uploaded %d photos", count) diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index 373f8c5663..939159cca8 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++
b/dimos/agents_deprecated/memory/image_embedding.py @@ -45,12 +45,18 @@ class ImageEmbeddingProvider: that can be stored in a vector database and used for similarity search. """ + # Remote multimodal embedding model used when model_name == "gemini". + GEMINI_EMBED_MODEL = "gemini-embedding-2" + def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: """ Initialize the image embedding provider. Args: - model_name: Name of the embedding model to use ("clip", "resnet"). + model_name: Name of the embedding model to use ("clip", "resnet", + "gemini"). "gemini" uses the remote Gemini Embedding API + (multimodal: image + text in one space) and needs + GEMINI_API_KEY or GOOGLE_API_KEY. dimensions: Dimensions of the embedding vectors """ self.model_name = model_name @@ -65,6 +71,23 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: def _initialize_model(self): # type: ignore[no-untyped-def] """Initialize the specified embedding model.""" + if self.model_name == "gemini": + from google import genai + + api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") + if not api_key: + raise ValueError( + "Gemini embeddings require GEMINI_API_KEY or GOOGLE_API_KEY to be set" + ) + # Client doubles as the "model"; sentinel processor keeps the + # None-guards in get_embedding / get_text_embedding happy. + self.model = genai.Client(api_key=api_key) # type: ignore[assignment] + self.processor = "gemini" # type: ignore[assignment] + logger.info( + f"Using remote Gemini embeddings: {self.GEMINI_EMBED_MODEL} " + f"({self.dimensions}d)" + ) + return try: import onnxruntime as ort # type: ignore[import-untyped] import torch # noqa: F401 @@ -119,11 +142,23 @@ def get_embedding(self, image: np.ndarray | str | bytes) -> np.ndarray: A numpy array containing the embedding vector """ if self.model is None or self.processor is None: - logger.error("Model not initialized. 
def _gemini_embed_content(self, contents: object) -> np.ndarray:
    """Call the Gemini multimodal embedding API and return a normalized vector.

    `contents` may be a text string or a list of `types.Part` (e.g. an image).
    Image and text land in the same space, so semantic text search over
    stored frames keeps working.

    Raises:
        RuntimeError: if the API response carries no embedding values. Failing
            loudly here keeps an empty or garbage vector out of the store.
    """
    from google.genai import types

    cfg = types.EmbedContentConfig(output_dimensionality=self.dimensions)
    result = self.model.models.embed_content(  # type: ignore[union-attr]
        model=self.GEMINI_EMBED_MODEL, contents=contents, config=cfg
    )

    # Both `embeddings` and `values` are optional in the response type, so guard
    # them instead of letting an IndexError/TypeError (or a 0-d array) escape.
    embeddings = result.embeddings
    values = embeddings[0].values if embeddings else None
    if not values:
        raise RuntimeError(
            f"Gemini embedding model '{self.GEMINI_EMBED_MODEL}' returned no embedding values"
        )

    vec = np.asarray(values, dtype=np.float32)
    # Truncated (<3072d) Gemini embeddings are not pre-normalized.
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec
def _configure_rerun_bridge_visual_transports(coordinator: ModuleCoordinator) -> None:
    """Hand the resolved shared-memory transports to a deployed Rerun bridge.

    The bridge subscribes to its configured pubsubs on its own; SHM streams are
    concrete transport objects, so they are forwarded here once stream wiring
    has filled the transport registry. Does nothing when no bridge is deployed
    or when no SHM transport was selected.
    """
    from dimos.visualization.rerun.bridge import RerunBridgeModule

    if RerunBridgeModule not in coordinator._deployed_modules:
        return

    # LCM transports are already visible through RerunBridgeModule.config.pubsubs.
    shm_kinds = (SHMTransport, pSHMTransport, JpegShmTransport)
    shm_transports = []
    for candidate in coordinator._transport_registry.values():
        if isinstance(candidate, shm_kinds):
            shm_transports.append(candidate)

    if shm_transports:
        coordinator.get_instance(RerunBridgeModule).set_visual_transports(shm_transports)
+ return (pSHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = PickleSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -193,14 +202,23 @@ def stop(self) -> None: class SHMTransport(PubSubTransport[T]): + """Raw bytes shared-memory transport for local fixed-size payloads.""" + _started: bool = False def __init__(self, topic: str, **kwargs) -> None: # type: ignore[no-untyped-def] super().__init__(topic) + self._kwargs = kwargs self.shm = BytesSharedMemory(**kwargs) def __reduce__(self): # type: ignore[no-untyped-def] - return (SHMTransport, (self.topic,)) + # Preserve sizing options such as default_capacity when the coordinator + # sends this transport to workers or to Rerun. + return (SHMTransport, (self.topic,), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + self._kwargs = state + self.shm = BytesSharedMemory(**state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: @@ -223,6 +241,8 @@ def stop(self) -> None: class JpegShmTransport(PubSubTransport[T]): + """JPEG-compressed shared-memory transport for local image streams.""" + _started: bool = False def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ignore[no-untyped-def] @@ -233,9 +253,19 @@ def __init__(self, topic: str, quality: int = 75, **kwargs) -> None: # type: ig self.shm = JpegSharedMemory(quality=quality, **kwargs) self.quality = quality + self._kwargs = kwargs def __reduce__(self): # type: ignore[no-untyped-def] - return (JpegShmTransport, (self.topic, self.quality)) + # Preserve quality and sizing options when crossing worker boundaries. 
+ return (JpegShmTransport, (self.topic, self.quality), self._kwargs) + + def __setstate__(self, state: dict[str, Any]) -> None: + from dimos.protocol.pubsub.impl.jpeg_shm import ( + JpegSharedMemory, + ) # deferred to avoid pulling in Image/cv2/rerun + + self._kwargs = state + self.shm = JpegSharedMemory(quality=self.quality, **state) def broadcast(self, _, msg) -> None: # type: ignore[no-untyped-def] if not self._started: diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index 362a95b000..f57a113b02 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -29,3 +29,7 @@ def create(name: VlModelName) -> VlModel: from dimos.models.vl.moondream import MoondreamVlModel return MoondreamVlModel() + case "gemini": + from dimos.models.vl.gemini import GeminiVlModel + + return GeminiVlModel() diff --git a/dimos/models/vl/gemini.py b/dimos/models/vl/gemini.py new file mode 100644 index 0000000000..0aecf6e1c7 --- /dev/null +++ b/dimos/models/vl/gemini.py @@ -0,0 +1,87 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class GeminiVlModelConfig(VlModelConfig):
    """Configuration for the Gemini VL model."""

    # Model id passed to `generate_content`.
    model_name: str = "gemini-2.5-flash"
    # Explicit API key; when None the key is read from the environment.
    api_key: str | None = None


class GeminiVlModel(VlModel):
    """VL model backed by Google's Gemini API (google-genai).

    Reuses the same ``GOOGLE_API_KEY`` / ``GEMINI_API_KEY`` auth as the Gemini
    TTS and embedding components, so no extra key or dependency is required.
    """

    config: GeminiVlModelConfig

    @cached_property
    def _client(self) -> "genai.Client":
        """Lazily build the Gemini client on first use.

        Key lookup order: config, then GOOGLE_API_KEY, then GEMINI_API_KEY.

        Raises:
            ValueError: if no API key is configured or present in the environment.
        """
        from google import genai

        api_key = (
            self.config.api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
        )
        if not api_key:
            raise ValueError(
                "Gemini VL model requires GOOGLE_API_KEY or GEMINI_API_KEY to be set"
            )

        return genai.Client(api_key=api_key)

    def _to_pil(self, image: Image | np.ndarray[Any, Any]) -> PILImage.Image:
        """Convert dimos Image or numpy array to PIL Image, applying auto_resize."""
        if isinstance(image, np.ndarray):
            # stacklevel=3 attributes the warning to the caller of `query`
            # (warn -> _to_pil -> query -> caller). With 2 it pointed at this
            # module's own `query` frame, which is not the code that needs fixing.
            warnings.warn(
                "GeminiVlModel.query should receive standard dimos Image type, not a numpy array",
                DeprecationWarning,
                stacklevel=3,
            )
            image = Image.from_numpy(image)

        image, _ = self._prepare_image(image)
        rgb_image = image.to_rgb()
        return PILImage.fromarray(rgb_image.data)

    def query(self, image: Image | np.ndarray, query: str) -> str:  # type: ignore[override]
        """Send the image and the text query to Gemini and return its text reply.

        Returns an empty string when the response carries no text.
        """
        pil_image = self._to_pil(image)

        response = self._client.models.generate_content(
            model=self.config.model_name,
            contents=[pil_image, query],
        )

        return response.text or ""

    def stop(self) -> None:
        """Release the Gemini client."""
        # cached_property stores its value in the instance __dict__; dropping it
        # makes the next access build a fresh client.
        self.__dict__.pop("_client", None)
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 3d101cca79..196c84dd94 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -94,6 +94,7 @@ "unitree-g1-sim": "dimos.robot.unitree.g1.blueprints.perceptive.unitree_g1_sim:unitree_g1_sim", "unitree-go2": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2:unitree_go2", "unitree-go2-agentic": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic:unitree_go2_agentic", + "unitree-go2-agentic-gemini": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_gemini:unitree_go2_agentic_gemini", "unitree-go2-agentic-huggingface": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_huggingface:unitree_go2_agentic_huggingface", "unitree-go2-agentic-ollama": "dimos.robot.unitree.go2.blueprints.agentic.unitree_go2_agentic_ollama:unitree_go2_agentic_ollama", "unitree-go2-basic": "dimos.robot.unitree.go2.blueprints.basic.unitree_go2_basic:unitree_go2_basic", @@ -125,10 +126,13 @@ all_modules = { "alfred-high-level": "dimos.robot.diy.alfred.effector_high_level.AlfredHighLevel", "arm-teleop-module": "dimos.teleop.quest.quest_extensions.ArmTeleopModule", + "audio-ws-module": "dimos.web.audio_ws_module.AudioWsModule", "b-box-navigation-module": "dimos.navigation.bbox_navigation.BBoxNavigationModule", "b1-connection-module": "dimos.robot.unitree.b1.connection.B1ConnectionModule", + "camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule", "camera-module": "dimos.hardware.sensors.camera.module.CameraModule", "cartesian-motion-controller": "dimos.manipulation.control.servo_control.cartesian_motion_controller.CartesianMotionController", + "cmd-bridge-module": "dimos.web.cmd_bridge_module.CmdBridgeModule", "control-coordinator": "dimos.control.coordinator.ControlCoordinator", "cost-mapper": "dimos.mapping.costmapper.CostMapper", "demo-calculator-skill": "dimos.agents.skills.demo_calculator_skill.DemoCalculatorSkill", @@ -149,6 +153,7 @@ 
"g1-high-level-web-rtc": "dimos.robot.unitree.g1.effectors.high_level.webrtc.G1HighLevelWebRtc", "g1-sim-connection": "dimos.robot.unitree.g1.mujoco_sim.G1SimConnection", "g1-whole-body-connection": "dimos.robot.unitree.g1.wholebody_connection.G1WholeBodyConnection", + "gemini-speak-skill": "dimos.agents.skills.gemini_speak_skill.GeminiSpeakSkill", "go2-connection": "dimos.robot.unitree.go2.connection.GO2Connection", "go2-fleet-connection": "dimos.robot.unitree.go2.fleet_connection.Go2FleetConnection", "go2-memory": "dimos.robot.unitree.go2.blueprints.smart.unitree_go2.Go2Memory", @@ -162,8 +167,10 @@ "keyboard-teleop": "dimos.robot.unitree.keyboard_teleop.KeyboardTeleop", "keyboard-teleop-module": "dimos.teleop.keyboard.keyboard_teleop_module.KeyboardTeleopModule", "local-planner": "dimos.navigation.nav_stack.modules.local_planner.local_planner.LocalPlanner", + "local-speak-skill": "dimos.agents.skills.local_speak_skill.LocalSpeakSkill", "manipulation-module": "dimos.manipulation.manipulation_module.ManipulationModule", "map": "dimos.robot.unitree.type.map.Map", + "map-uploader": "dimos.agents.skills.map_uploader.MapUploader", "marker-tf-module": "dimos.perception.fiducial.marker_tf_module.MarkerTfModule", "mcp-client": "dimos.agents.mcp.mcp_client.McpClient", "mcp-server": "dimos.agents.mcp.mcp_server.McpServer", @@ -205,6 +212,7 @@ "simple-planner": "dimos.navigation.nav_stack.modules.simple_planner.simple_planner.SimplePlanner", "spatial-memory": "dimos.perception.spatial_perception.SpatialMemory", "speak-skill": "dimos.agents.skills.speak_skill.SpeakSkill", + "take-picture-skill": "dimos.agents.skills.take_picture_skill.TakePictureSkill", "tare-planner": "dimos.navigation.nav_stack.modules.tare_planner.tare_planner.TarePlanner", "temporal-memory": "dimos.perception.experimental.temporal_memory.temporal_memory.TemporalMemory", "terrain-analysis": "dimos.navigation.nav_stack.modules.terrain_analysis.terrain_analysis.TerrainAnalysis", diff --git 
@main.command("prefetch-models")
def prefetch_models(
    skip_lfs: bool = typer.Option(False, "--skip-lfs", help="Skip git-LFS model weight pulls"),
    skip_whisper: bool = typer.Option(False, "--skip-whisper", help="Skip the Whisper STT model"),
) -> None:
    """Pre-download local ML models so they don't cold-download mid-mission.

    Warms the HuggingFace cache (Moondream, CLIP), pulls git-LFS model weights
    (YOLO, EdgeTAM, etc.) and the Whisper STT model. Safe to re-run — already
    cached models are quick no-ops. Run this once after `uv sync` or before a
    mission to avoid the 120s tool-call timeout caused by a 3.85GB Moondream
    download happening lazily inside `look_out_for`.
    """
    # (label, succeeded) per attempted step, in execution order.
    outcomes: list[tuple[str, bool]] = []

    def _run(label: str, fn: Any) -> None:
        # One step: announce it, run it, record and report the outcome. A
        # failing step never aborts the remaining ones.
        typer.echo(f"→ {label} ...")
        try:
            fn()
            outcomes.append((label, True))
            typer.echo(f" ✓ {label}")
        except Exception as e:  # noqa: BLE001 - report and continue per model
            outcomes.append((label, False))
            typer.echo(f" ✗ {label}: {e}", err=True)

    # HuggingFace models: instantiate + start() forces from_pretrained() to download.
    def _warm_vl(name: str) -> None:
        from dimos.models.vl.create import create

        vl_model = create(name)
        vl_model.start()

    def _warm_clip() -> None:
        from dimos.models.embedding.clip import CLIPModel

        clip_model = CLIPModel()
        clip_model.start()

    _run("VL: moondream (vikhyatk/moondream2)", lambda: _warm_vl("moondream"))
    _run("Embedding: CLIP (openai/clip-vit-base-patch32)", _warm_clip)

    # git-LFS model weights (pulled into the repo data dir on first access).
    if not skip_lfs:
        from dimos.utils.data import get_data

        lfs_categories = (
            "models_yolo",
            "models_yoloe",
            "models_edgetam",
            "models_mobileclip",
            "models_torchreid",
            "models_clip",
        )
        for category in lfs_categories:
            # Default argument pins the current category for the callback.
            _run(f"LFS: {category}", lambda c=category: get_data(c))

    # Whisper STT (matches node_whisper's preferred openai-whisper backend).
    if not skip_whisper:

        def _warm_whisper() -> None:
            import whisper

            whisper.load_model("base")

        _run("STT: whisper base", _warm_whisper)

    total = len(outcomes)
    succeeded = sum(1 for _, passed in outcomes if passed)
    typer.echo(f"\nPrefetch complete: {succeeded}/{total} succeeded.")
    if succeeded != total:
        raise typer.Exit(1)
@simple_mcache
def audio_stream(self) -> Observable[AudioMessage]:
    """Subscribe to the robot's mic via WebRTC.

    Emits int16 PCM chunks. The Go2's WebRTC audio transceiver is sendrecv;
    first frame triggers channel activation.

    The returned observable unregisters the track callback and switches the
    audio channel off again when it terminates or is disposed.
    """
    import logging

    log = logging.getLogger(__name__)
    subject: Subject[AudioMessage] = Subject()
    stop_event = threading.Event()

    async def accept_track(frame: Any) -> None:
        if stop_event.is_set():
            return
        try:
            arr = np.asarray(frame.to_ndarray())
            rate = getattr(frame, "sample_rate", 48000)
            if arr.ndim == 1:
                arr = arr[np.newaxis, :]

            # NOTE(review): assumes PyAV AudioFrame semantics — planar formats
            # give (channels, samples) but packed formats give
            # (1, samples * channels), so the array shape alone under-reports
            # packed stereo as mono. Prefer the frame's own layout when it has
            # one; fall back to the shape otherwise. TODO confirm on hardware.
            layout = getattr(frame, "layout", None)
            channels = getattr(layout, "nb_channels", None)
            if not channels:
                channels = len(getattr(layout, "channels", ())) or arr.shape[0]

            if np.issubdtype(arr.dtype, np.floating):
                arr = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
            elif arr.dtype != np.int16:
                arr = arr.astype(np.int16)

            # Transposing interleaves planar data (L R L R ...) and leaves a
            # packed (1, N) frame's sample order untouched.
            pcm = np.ascontiguousarray(arr.T).tobytes()
            subject.on_next(AudioMessage(data=pcm, sample_rate=rate, channels=channels))
        except Exception:  # noqa: BLE001 — one bad frame must not kill the stream
            log.debug("dropping undecodable audio frame", exc_info=True)

    self.conn.audio.add_track_callback(accept_track)

    def switch_on() -> None:
        try:
            self.conn.audio.switchAudioChannel(True)
        except Exception:  # noqa: BLE001 — best effort, but leave a trace
            log.warning("failed to switch audio channel on", exc_info=True)

    self.loop.call_soon_threadsafe(switch_on)

    def stop() -> None:
        stop_event.set()
        try:
            self.conn.audio.track_callbacks.remove(accept_track)
        except ValueError:
            pass  # callback already removed

        def switch_off() -> None:
            try:
                self.conn.audio.switchAudioChannel(False)
            except Exception:  # noqa: BLE001 — best effort on teardown
                log.debug("failed to switch audio channel off", exc_info=True)

        self.loop.call_soon_threadsafe(switch_off)

    return subject.pipe(ops.finally_action(stop))


def play_wav_bytes(self, wav: bytes) -> None:
    """Play a WAV through the Go2 speaker via the megaphone path.

    Fire-and-forget — returns immediately. The Go2 enters megaphone mode
    for the duration of the clip then exits, so subsequent commands are
    unaffected. Failures of the background upload/playback are logged rather
    than raised, since no caller awaits them.
    """
    import logging

    if not wav:
        return

    log = logging.getLogger(__name__)

    try:
        with wave.open(io.BytesIO(wav), "rb") as wf:
            duration = wf.getnframes() / float(wf.getframerate())
    except (wave.Error, EOFError, ZeroDivisionError):
        # Unknown format, or a header declaring a zero frame rate: hold the
        # megaphone for a conservative fixed time instead of raising.
        duration = 5.0

    async def _upload_play_exit() -> None:
        from unitree_webrtc_connect.webrtc_audiohub import WebRTCAudioHub

        tmp_path: str | None = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
                # Record the path before writing so a failed write still gets
                # cleaned up below (the file is created with delete=False).
                tmp_path = fp.name
                fp.write(wav)

            hub = WebRTCAudioHub(self.conn)
            await hub.upload_megaphone(tmp_path)
            await hub.enter_megaphone()
            try:
                # Hold megaphone for the clip's duration, plus a small flush
                # margin, then release so other commands work normally.
                await asyncio.sleep(duration + 0.5)
            finally:
                # Always leave megaphone mode once it was entered, even if the
                # hold is cancelled or fails.
                await hub.exit_megaphone()
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass

    def _report(done: Any) -> None:
        # Nobody awaits the future, so surface failures here.
        if done.cancelled():
            return
        error = done.exception()
        if error is not None:
            log.warning("play_wav_bytes playback failed: %s", error)

    future = asyncio.run_coroutine_threadsafe(_upload_play_exit(), self.loop)
    future.add_done_callback(_report)
+ +from dimos.agents.mcp.mcp_client import McpClient +from dimos.agents.mcp.mcp_server import McpServer +from dimos.agents.skills.gemini_speak_skill import GeminiSpeakSkill +from dimos.agents.skills.navigation import NavigationSkillContainer +from dimos.agents.skills.person_follow import PersonFollowSkillContainer +from dimos.agents.skills.map_uploader import MapUploader +from dimos.agents.skills.speak_skill import SpeakSkill +from dimos.agents.skills.take_picture_skill import TakePictureSkill +from dimos.core.coordination.blueprints import autoconnect +from dimos.experimental.security_demo.security_module import SecurityModule +from dimos.perception.spatial_perception import SpatialMemory +from dimos.robot.unitree.go2.blueprints.agentic._common_agentic import _common_agentic +from dimos.robot.unitree.go2.blueprints.smart.unitree_go2_spatial import unitree_go2_spatial +from dimos.robot.unitree.go2.connection import GO2Connection + +# Disabled for a no-OpenAI / non-CUDA setup: +# - SecurityModule needs a CUDA GPU (EdgeTAM) -> won't boot on Apple Silicon. +# - SpeakSkill (TTS) hardcodes OpenAI -> needs OPENAI_API_KEY; replaced below +# by GeminiSpeakSkill, which uses the Gemini TTS API (reuses GOOGLE_API_KEY, +# no GPU, cross-platform so it can also run on the robot). +# SpatialMemory is re-declared to use remote Gemini embeddings instead of the +# local CLIP model (which fails on Apple CoreML); the later atom overrides the +# one inside unitree_go2_spatial. +# ALL compute runs on the Mac (the dog has no onboard brain), so VL detection is +# ALSO Gemini — no local Moondream (~6 min/inference + crashes Metal on Apple +# Silicon, which aborted the blueprint at startup): +# - .global_config(detection_model="gemini") drives look_out_for / PerceiveLoop. +# - NavigationSkillContainer + PersonFollowSkillContainer use vl_model_name="gemini" +# (these read the per-skill name, not the global knob). 
+unitree_go2_agentic_gemini = ( + autoconnect( + unitree_go2_spatial, + SpatialMemory.blueprint(embedding_model="gemini", embedding_dimensions=768), + McpServer.blueprint(), + McpClient.blueprint(model="google_genai:gemini-2.5-flash"), + _common_agentic, + NavigationSkillContainer.blueprint(vl_model_name="gemini"), + PersonFollowSkillContainer.blueprint( + camera_info=GO2Connection.camera_info_static, + vl_model_name="gemini", + ), + GeminiSpeakSkill.blueprint(), + TakePictureSkill.blueprint(), + MapUploader.blueprint(), + ) + .global_config(detection_model="gemini") + .disabled_modules(SecurityModule, SpeakSkill) +) + +__all__ = ["unitree_go2_agentic_gemini"] diff --git a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py index 96a291163d..ea315119c5 100644 --- a/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py +++ b/dimos/robot/unitree/go2/blueprints/basic/unitree_go2_basic.py @@ -14,29 +14,47 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import platform from typing import Any -from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE +from dimos.constants import ( + DEFAULT_CAPACITY_COLOR_IMAGE, + DEFAULT_CAPACITY_OCCUPANCY_GRID, + DEFAULT_CAPACITY_POINTCLOUD, +) from dimos.core.coordination.blueprints import autoconnect from dimos.core.global_config import global_config from dimos.core.transport import pSHMTransport +from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 from dimos.robot.unitree.go2.connection import GO2Connection from dimos.visualization.vis_module import vis_module -# Mac has some issue with high bandwidth UDP, so we use pSHMTransport for color_image -# actually we can use pSHMTransport for all platforms, and for all streams -# TODO need a global transport toggle on blueprints/global config -_mac_transports: dict[tuple[str, type], pSHMTransport[Image]] = { +# Route large local replay and mapping streams through SHM on every platform. +# Small control/status streams continue to use the default LCM transport. 
+_local_high_bandwidth_transports: dict[tuple[str, type], pSHMTransport[Any]] = { ("color_image", Image): pSHMTransport( - "color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + "/color_image", default_capacity=DEFAULT_CAPACITY_COLOR_IMAGE + ), + ("lidar", PointCloud2): pSHMTransport("/lidar", default_capacity=DEFAULT_CAPACITY_POINTCLOUD), + ("pointcloud", PointCloud2): pSHMTransport( + "/pointcloud", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_map", PointCloud2): pSHMTransport( + "/global_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("merged_map", PointCloud2): pSHMTransport( + "/merged_map", default_capacity=DEFAULT_CAPACITY_POINTCLOUD + ), + ("global_costmap", OccupancyGrid): pSHMTransport( + "/global_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID + ), + ("navigation_costmap", OccupancyGrid): pSHMTransport( + "/navigation_costmap", default_capacity=DEFAULT_CAPACITY_OCCUPANCY_GRID ), } -_transports_base = ( - autoconnect() if platform.system() == "Linux" else autoconnect().transports(_mac_transports) -) +_transports_base = autoconnect().transports(_local_high_bandwidth_transports) def _convert_camera_info(camera_info: Any) -> Any: diff --git a/dimos/robot/unitree/go2/connection.py b/dimos/robot/unitree/go2/connection.py index 5568a473ef..e472c39bdb 100644 --- a/dimos/robot/unitree/go2/connection.py +++ b/dimos/robot/unitree/go2/connection.py @@ -48,7 +48,7 @@ from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.Image import Image from dimos.msgs.sensor_msgs.PointCloud2 import PointCloud2 -from dimos.robot.unitree.connection import UnitreeWebRTCConnection +from dimos.robot.unitree.connection import AudioMessage, UnitreeWebRTCConnection from dimos.utils.decorators.decorators import cached_property, simple_mcache if sys.version_info < (3, 13): @@ -207,6 +207,8 @@ class GO2Connection(Module, Camera, Pointcloud): lidar: Out[PointCloud2] color_image: Out[Image] camera_info: 
Out[CameraInfo] + audio: Out[AudioMessage] + audio_in: In[bytes] connection: Go2ConnectionProtocol camera_info_static: CameraInfo = _camera_info_static() @@ -246,6 +248,19 @@ def onimage(image: Image) -> None: self.register_disposable(self.connection.video_stream().subscribe(onimage)) self.register_disposable(Disposable(self.cmd_vel.subscribe(self.move))) + if hasattr(self.connection, "audio_stream"): + try: + self.register_disposable( + self.connection.audio_stream().subscribe(self.audio.publish) + ) + except Exception as e: + logger.warning(f"audio_stream not started: {e}") + + if hasattr(self.connection, "play_wav_bytes"): + self.register_disposable( + Disposable(self.audio_in.subscribe(self.connection.play_wav_bytes)) + ) + self._camera_info_thread = Thread( target=self.publish_camera_info, daemon=True, @@ -352,6 +367,35 @@ def publish_request(self, topic: str, data: dict[str, Any]) -> dict[Any, Any]: """ return self.connection.publish_request(topic, data) + @skill + def play_wav(self, wav_path: str) -> str: + """Play a WAV file through the robot's speaker (megaphone mode). + + Args: + wav_path: Path on the dimos host filesystem. Use `play_wav_b64` + instead when the agent runs on a different machine. + """ + if not hasattr(self.connection, "play_wav_bytes"): + return "play_wav unavailable on this connection (simulation?)" + with open(wav_path, "rb") as f: + self.connection.play_wav_bytes(f.read()) + return f"queued {wav_path} for playback" + + @skill + def play_wav_b64(self, wav_b64: str) -> str: + """Play a base64-encoded WAV through the robot's speaker. + + Use this from remote MCP clients that don't share the dimos host's + filesystem. 44.1 kHz mono WAVs play most reliably. 
+ """ + import base64 + + if not hasattr(self.connection, "play_wav_bytes"): + return "play_wav unavailable on this connection (simulation?)" + wav = base64.b64decode(wav_b64) + self.connection.play_wav_bytes(wav) + return f"queued {len(wav)} bytes for playback" + @skill def observe(self) -> Image | None: """Returns the latest video frame from the robot camera. Use this skill for any visual world queries. diff --git a/dimos/robot/unitree/tilt_spec.py b/dimos/robot/unitree/tilt_spec.py new file mode 100644 index 0000000000..a1ef528c74 --- /dev/null +++ b/dimos/robot/unitree/tilt_spec.py @@ -0,0 +1,26 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Protocol + +from dimos.spec.utils import Spec + + +# Module-ref spec for tilting the robot body. Resolves (structurally) to +# UnitreeSkillContainer.tilt_body, letting another module (e.g. TakePictureSkill) +# aim the body-fixed camera without owning the WebRTC connection. +class TiltSpec(Spec, Protocol): + def tilt_body( + self, pitch_deg: float = 0.0, roll_deg: float = 0.0, yaw_deg: float = 0.0 + ) -> str: ... 
diff --git a/dimos/robot/unitree/unitree_skill_container.py b/dimos/robot/unitree/unitree_skill_container.py index 88194473e6..160a1b268a 100644 --- a/dimos/robot/unitree/unitree_skill_container.py +++ b/dimos/robot/unitree/unitree_skill_container.py @@ -19,7 +19,7 @@ import math import time -from unitree_webrtc_connect.constants import RTC_TOPIC +from unitree_webrtc_connect.constants import RTC_TOPIC, SPORT_CMD from dimos.agents.annotation import skill from dimos.core.core import rpc @@ -300,6 +300,37 @@ def execute_sport_command(self, command_name: str) -> str: logger.error(f"Failed to execute {command_name}: {e}") return "Failed to execute the command." + @skill + def tilt_body( + self, pitch_deg: float = 0.0, roll_deg: float = 0.0, yaw_deg: float = 0.0 + ) -> str: + """Tilt the robot's body to aim its (fixed) camera up or down. + + The Go2's camera is body-mounted, so "looking up/down" means pitching the + whole body. NEGATIVE pitch_deg looks UP, positive looks DOWN. Useful range + is about -40..40 degrees; roll/yaw are optional. The robot must be standing + first (run StandUp or BalanceStand). This is a held pose — call tilt_body() + with no args to return level. + + Example: tilt_body(pitch_deg=-20) # look up at e.g. a tabletop + """ + # Clamp to the Go2's safe standing envelope (radians). + roll = max(-0.75, min(0.75, math.radians(float(roll_deg)))) + pitch = max(-0.75, min(0.75, math.radians(float(pitch_deg)))) + yaw = max(-0.6, min(0.6, math.radians(float(yaw_deg)))) + try: + self._connection.publish_request( + RTC_TOPIC["SPORT_MOD"], + { + "api_id": SPORT_CMD["Euler"], + "parameter": {"data": {"x": roll, "y": pitch, "z": yaw}}, + }, + ) + return f"Body tilted (pitch={pitch_deg}, roll={roll_deg}, yaw={yaw_deg} deg)." + except Exception as e: + logger.error(f"Failed to tilt body: {e}") + return "Failed to tilt the body." 
+ _commands = "\n".join( [f'- "{name}": {description}' for name, (_, description) in _UNITREE_COMMANDS.items()] diff --git a/dimos/stream/audio/tts/node_gemini.py b/dimos/stream/audio/tts/node_gemini.py new file mode 100644 index 0000000000..6563fb6f9d --- /dev/null +++ b/dimos/stream/audio/tts/node_gemini.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import threading +import time + +from google import genai +from google.genai import types +import numpy as np +from reactivex import Observable, Subject + +from dimos.stream.audio.base import ( + AbstractAudioEmitter, + AudioEvent, +) +from dimos.stream.audio.text.base import AbstractTextConsumer, AbstractTextEmitter +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + +# Gemini TTS returns single-channel 16-bit PCM at 24 kHz, little-endian. +_SAMPLE_RATE = 24000 + + +class GeminiTTSNode(AbstractTextConsumer, AbstractAudioEmitter, AbstractTextEmitter): + """ + A text-to-speech node that consumes text, emits audio using Google's Gemini TTS API, and passes through text. + + This node implements AbstractTextConsumer to receive text input, AbstractAudioEmitter + to provide audio output, and AbstractTextEmitter to pass through the text being spoken, + allowing it to be inserted into a text-to-audio pipeline with text passthrough capabilities. 
+ + Mirrors OpenAITTSNode but uses the Gemini API directly (google-genai), so it + needs no OpenAI key and reuses the GOOGLE_API_KEY already used for the LLM + and embeddings. Only the ``*-preview-tts`` models support audio output. + """ + + def __init__( + self, + api_key: str | None = None, + voice: str = "Kore", + model: str = "gemini-2.5-flash-preview-tts", + instruction: str = "Read the following text aloud verbatim", + ) -> None: + """ + Initialize GeminiTTSNode. + + Args: + api_key: Gemini API key (if None, reads GOOGLE_API_KEY / GEMINI_API_KEY) + voice: Prebuilt Gemini voice name (e.g. "Kore", "Puck", "Charon") + model: Gemini TTS model (must be a ``*-preview-tts`` model) + instruction: Style directive prepended to each utterance. Gemini + narrates only the text after it (not the directive itself); it + also stops the model from "answering" short prompts instead of + reading them. Set to "" to send the raw text unmodified. + """ + self.voice = voice + self.model = model + self.instruction = instruction + + api_key = api_key or os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY") + if not api_key: + raise ValueError("Gemini TTS requires GEMINI_API_KEY or GOOGLE_API_KEY to be set") + self.client = genai.Client(api_key=api_key) + + # Initialize state + self.audio_subject = Subject() # type: ignore[var-annotated] + self.text_subject = Subject() # type: ignore[var-annotated] + self.subscription = None + self.processing_thread = None + self.is_running = True + self.text_queue = [] # type: ignore[var-annotated] + self.queue_lock = threading.Lock() + + def emit_audio(self) -> Observable: # type: ignore[type-arg] + """ + Returns an observable that emits audio frames. + + Returns: + Observable emitting AudioEvent objects + """ + return self.audio_subject + + def emit_text(self) -> Observable: # type: ignore[type-arg] + """ + Returns an observable that emits the text being spoken. 
+ + Returns: + Observable emitting text strings + """ + return self.text_subject + + def consume_text(self, text_observable: Observable) -> "AbstractTextConsumer": # type: ignore[type-arg] + """ + Start consuming text from the observable source. + + Args: + text_observable: Observable source of text strings + + Returns: + Self for method chaining + """ + # Idempotent: start the worker + subscription only once. This used to be + # called per utterance, leaking a new thread + subscription each time + # (and re-logging "Starting GeminiTTSNode" on every speak). If called + # again with a new source, just re-point the subscription. + if self.processing_thread is not None and self.processing_thread.is_alive(): + if self.subscription is not None: + self.subscription.dispose() + self.subscription = text_observable.subscribe( # type: ignore[assignment] + on_next=self._queue_text, + on_error=lambda e: logger.error(f"Error in GeminiTTSNode: {e}"), + ) + return self + + logger.info("Starting GeminiTTSNode") + + # Start the processing thread + self.processing_thread = threading.Thread(target=self._process_queue, daemon=True) # type: ignore[assignment] + self.processing_thread.start() # type: ignore[attr-defined] + + # Subscribe to the text observable + self.subscription = text_observable.subscribe( # type: ignore[assignment] + on_next=self._queue_text, + on_error=lambda e: logger.error(f"Error in GeminiTTSNode: {e}"), + ) + + return self + + def _queue_text(self, text: str) -> None: + """ + Add text to the processing queue. 
+ + Args: + text: The text to synthesize + """ + if not text.strip(): + return + + with self.queue_lock: + self.text_queue.append(text) + + def _process_queue(self) -> None: + """Background thread to process the text queue.""" + while self.is_running: + # Check if there's text to process + text_to_process = None + with self.queue_lock: + if self.text_queue: + text_to_process = self.text_queue.pop(0) + + if text_to_process: + self._synthesize_speech(text_to_process) + else: + # Sleep a bit to avoid busy-waiting + time.sleep(0.1) + + def _synthesize_speech(self, text: str) -> None: + """ + Convert text to speech using the Gemini TTS API. + + Args: + text: The text to synthesize + """ + try: + contents = f"{self.instruction}: {text}" if self.instruction else text + response = self.client.models.generate_content( + model=self.model, + contents=contents, + config=types.GenerateContentConfig( + response_modalities=["AUDIO"], + speech_config=types.SpeechConfig( + voice_config=types.VoiceConfig( + prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=self.voice) + ) + ), + ), + ) + self.text_subject.on_next(text) + + # Gemini returns raw 16-bit PCM bytes (24 kHz, mono, little-endian). + pcm_bytes = response.candidates[0].content.parts[0].inline_data.data + audio_array = np.frombuffer(pcm_bytes, dtype=np.int16) + + audio_event = AudioEvent( + data=audio_array, + sample_rate=_SAMPLE_RATE, + timestamp=time.time(), + channels=1, + ) + + self.audio_subject.on_next(audio_event) + + except Exception as e: + logger.error(f"Error synthesizing speech: {e}") + + def dispose(self) -> None: + """Clean up resources.""" + logger.info("Disposing GeminiTTSNode") + + self.is_running = False + + # Clear pending items so the thread doesn't start new synthesis. 
+ with self.queue_lock: + self.text_queue.clear() + + if self.processing_thread and self.processing_thread.is_alive(): + self.processing_thread.join(timeout=2.0) + + if self.subscription: + self.subscription.dispose() + self.subscription = None + + # Complete the subjects + self.audio_subject.on_completed() + self.text_subject.on_completed() + + +if __name__ == "__main__": + from dimos.stream.audio.node_output import SounddeviceAudioOutput + from dimos.stream.audio.text.node_stdout import TextPrinterNode + from dimos.stream.audio.utils import keepalive + + # Create a simple text subject that we can push values to + text_subject = Subject() # type: ignore[var-annotated] + + tts_node = GeminiTTSNode(voice="Kore") + tts_node.consume_text(text_subject) + + # Create and connect an audio output node - match Gemini's 24 kHz output + audio_output = SounddeviceAudioOutput(sample_rate=_SAMPLE_RATE) + audio_output.consume_audio(tts_node.emit_audio()) + + stdout = TextPrinterNode(prefix="[Spoken Text] ") + stdout.consume_text(tts_node.emit_text()) + + test_messages = [ + "Hello!", + "This is a test of the Gemini text to speech system.", + ] + + print("Starting Gemini TTS test...") + print("-" * 60) + + for message in test_messages: + text_subject.on_next(message) + + keepalive() diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 2f5fb1efa9..eb00f23730 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -43,6 +43,7 @@ from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig +from dimos.core.transport import PubSubTransport from dimos.protocol.pubsub.impl.lcmpubsub import LCM from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.protocol.pubsub.spec import SubscribeAllCapable @@ -164,7 +165,10 @@ def _default_blueprint() -> Blueprint: class Config(ModuleConfig): + # Pubsubs cover discoverable sources such as LCM. 
visual_transports is + # populated by the coordinator for concrete local streams such as SHM. pubsubs: list[SubscribeAllCapable[Any, Any]] = field(default_factory=lambda: [LCM()]) + visual_transports: list[PubSubTransport[Any]] = field(default_factory=list) visual_override: dict[Glob | str, Callable[[Any], Archetype] | None] = field( default_factory=dict @@ -186,10 +190,12 @@ class Config(ModuleConfig): class RerunBridgeModule(Module): - """Bridge that logs messages from pubsubs to Rerun. + """Bridge that logs transport messages to Rerun. - Spawns its own Rerun viewer and subscribes to all topics on each provided - pubsub. Any message that has a to_rerun() method is automatically logged. + Spawns its own Rerun viewer and subscribes to configured pubsubs and + explicit visual transports. Pubsubs cover discoverable transports such as + LCM; visual_transports covers concrete local transports such as SHM. + Any message that has a to_rerun() method is automatically logged. Example: from dimos.protocol.pubsub.impl.lcmpubsub import LCM @@ -215,6 +221,8 @@ def __init__(self, **kwargs: Any) -> None: self._last_log = {} self._override_cache: dict[str, Callable[[Any], RerunData | None]] = {} self._frame_attached: dict[str, str] = {} + self._subscribed_visual_transport_topics: set[str] = set() + self._started = False @property def host(self) -> str: @@ -265,13 +273,56 @@ def composed(msg: Any) -> RerunData | None: return composed def _get_entity_path(self, topic: Any) -> str: + """Map a transport topic to a Rerun entity path. + + LCM topics usually already include a leading slash and a type suffix. + SHM topics are plain strings. Normalize both forms so visual overrides + such as "world/color_image" match consistently. 
+ """ if self.config.topic_to_entity: return self.config.topic_to_entity(topic) topic_str = getattr(topic, "name", None) or str(topic) topic_str = topic_str.split("#")[0] # strip LCM topic suffix + if not topic_str.startswith("/"): + topic_str = f"/{topic_str}" return f"{self.config.entity_prefix}{topic_str}" + @rpc + def set_visual_transports(self, transports: list[PubSubTransport[Any]]) -> None: + """Replace explicit visual transports and subscribe when running. + + The coordinator calls this after stream wiring and after loading + additional blueprints into an existing coordinator. + """ + self.config.visual_transports = transports + if self._started: + self._subscribe_visual_transports() + + def _subscribe_visual_transports(self) -> None: + """Attach to configured SHM streams once per topic.""" + for transport in self.config.visual_transports: + topic = str(getattr(transport, "topic", "")) + if not topic or topic in self._subscribed_visual_transport_topics: + continue + self._subscribed_visual_transport_topics.add(topic) + if hasattr(transport, "start"): + transport.start() + # If subscribe raises, the bridge still owns cleanup for the transport it started. + self.register_disposable(Disposable(transport.stop)) + transport_topic = getattr(transport, "topic", topic) + + def on_visual_message(msg: Any, transport_topic: Any = transport_topic) -> None: + self._on_message(msg, transport_topic) + + unsub = transport.subscribe( + # Capture the current topic so callbacks keep the correct + # entity path even as this loop advances to the next transport. 
+ on_visual_message + ) + if unsub is not None: + self.register_disposable(Disposable(unsub)) + def _on_message(self, msg: Any, topic: Any) -> None: """Handle incoming message - log to rerun.""" @@ -306,6 +357,7 @@ def _on_message(self, msg: Any, topic: Any) -> None: @rpc def start(self) -> None: super().start() + self._started = True logger.info("Rerun bridge starting") @@ -397,6 +449,8 @@ def start(self) -> None: unsub = pubsub.subscribe_all(self._on_message) self.register_disposable(Disposable(unsub)) + self._subscribe_visual_transports() + for pubsub in self.config.pubsubs: if hasattr(pubsub, "stop"): self.register_disposable(Disposable(pubsub.stop)) # type: ignore[union-attr] @@ -506,8 +560,10 @@ def log_blueprint_graph(self, dot_code: str, module_names: list[str]) -> None: @rpc def stop(self) -> None: + self._started = False self._override_cache.clear() self._frame_attached.clear() + self._subscribed_visual_transport_topics.clear() super().stop() diff --git a/dimos/web/audio_ws_module.py b/dimos/web/audio_ws_module.py new file mode 100644 index 0000000000..8f54d0a57f --- /dev/null +++ b/dimos/web/audio_ws_module.py @@ -0,0 +1,169 @@ +"""Bridge robot audio to a web client. 
+ + - WebSocket /audio_out -> sends int16 PCM frames from the Go2 mic + - GET /audio_info -> reports the current sample_rate + channels (so + consumers know how to decode /audio_out frames) + - POST /play -> uploads a WAV body, plays it on the Go2 speaker + +Compose with `unitree-go2-basic`: + + dimos run unitree-go2-basic audio-ws-module + +Then: + ws://127.0.0.1:7781/audio_out (mic, binary frames) + curl http://127.0.0.1:7781/audio_info (sample_rate, channels) + curl --data-binary @clip.wav http://127.0.0.1:7781/play +""" + +import asyncio +import queue +import threading +from typing import Any + +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect +from fastapi.middleware.cors import CORSMiddleware +import uvicorn + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.robot.unitree.connection import AudioMessage +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class AudioWsConfig(ModuleConfig): + port: int = 7781 + + +class AudioWsModule(Module): + config: AudioWsConfig + audio: In[AudioMessage] # robot mic (from GO2Connection.audio) + audio_in: Out[bytes] # WAV bytes for robot speaker (-> GO2Connection.audio_in) + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._app = FastAPI() + self._app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["GET", "POST"], + allow_headers=["*"], + ) + # Cross-thread handoff: subscriber thread enqueues, broadcast task drains. 
+ self._frame_queue: queue.Queue[bytes] = queue.Queue(maxsize=200) + self._clients: set[WebSocket] = set() + self._server: uvicorn.Server | None = None + self._thread: threading.Thread | None = None + self._last_format: dict[str, int] | None = None + self._stop_event = threading.Event() + self._setup_routes() + + def _setup_routes(self) -> None: + @self._app.get("/audio_info") + def audio_info() -> dict[str, Any]: + if self._last_format is None: + return {"status": "no frame yet"} + return {"status": "ok", **self._last_format} + + @self._app.websocket("/audio_out") + async def audio_out(ws: WebSocket) -> None: + await ws.accept() + if self._last_format is not None: + await ws.send_json({"event": "format", **self._last_format}) + self._clients.add(ws) + try: + while True: + # Server -> client only; sink any inbound messages. + await ws.receive() + except WebSocketDisconnect: + pass + finally: + self._clients.discard(ws) + + @self._app.post("/play") + async def play(req: Request) -> dict[str, str]: + wav = await req.body() + if not wav: + return {"status": "empty"} + self.audio_in.publish(wav) + return {"status": "queued", "bytes": str(len(wav))} + + @self._app.on_event("startup") + async def _spawn_broadcaster() -> None: + asyncio.create_task(self._broadcast_loop()) + + async def _broadcast_loop(self) -> None: + """Drain the cross-thread queue and fan out to WebSocket clients.""" + loop = asyncio.get_running_loop() + while not self._stop_event.is_set(): + try: + chunk = await loop.run_in_executor(None, self._frame_queue.get, True, 0.5) + except queue.Empty: + continue + if not self._clients: + continue + stale: list[WebSocket] = [] + for ws in list(self._clients): + try: + await ws.send_bytes(chunk) + except Exception: + stale.append(ws) + for ws in stale: + self._clients.discard(ws) + + @rpc + def start(self) -> None: + super().start() + + self.audio.subscribe(self._on_audio) + + def run() -> None: + config = uvicorn.Config( + self._app, + host="127.0.0.1", + 
port=self.config.port, + log_level="warning", + lifespan="on", + ) + self._server = uvicorn.Server(config) + try: + # Server.run() owns its own asyncio loop and cleanup; this + # avoids the run_until_complete/run_coroutine_threadsafe race + # that produced "Event loop stopped before Future completed". + self._server.run() + except OSError as e: + logger.error( + f"audio-ws failed to bind :{self.config.port} ({e}); " + f"is another instance running? `lsof -ti :{self.config.port} | xargs kill -9`" + ) + except Exception: + logger.exception("audio-ws server crashed") + + self._thread = threading.Thread(target=run, daemon=True, name="audio-ws-uvicorn") + self._thread.start() + logger.info( + f"audio-ws-module: ws://127.0.0.1:{self.config.port}/audio_out " + f"| GET http://127.0.0.1:{self.config.port}/audio_info " + f"| POST http://127.0.0.1:{self.config.port}/play" + ) + + def _on_audio(self, msg: AudioMessage) -> None: + self._last_format = {"sample_rate": msg.sample_rate, "channels": msg.channels} + try: + self._frame_queue.put_nowait(msg.data) + except queue.Full: + # Drop oldest to bound latency. + try: + self._frame_queue.get_nowait() + self._frame_queue.put_nowait(msg.data) + except queue.Empty: + pass + + @rpc + def stop(self) -> None: + self._stop_event.set() + if self._server is not None: + self._server.should_exit = True + super().stop() diff --git a/dimos/web/cmd_bridge_module.py b/dimos/web/cmd_bridge_module.py new file mode 100644 index 0000000000..266808bfa9 --- /dev/null +++ b/dimos/web/cmd_bridge_module.py @@ -0,0 +1,222 @@ +"""HTTP bridge for driving the robot from an external process. 
+ + - POST /cmd_vel -> publish a single Twist for `duration` seconds (then 0) + - POST /path -> execute a sequence of relative moves in order + - POST /stop -> emergency zero-Twist + - GET /pose -> latest base_link pose in the world frame + +Compose with any blueprint that has GO2Connection (sim or real robot): + + dimos --simulation run unitree-go2-basic cmd-bridge-module + +The "path" is a list of relative steps that are sent as raw cmd_vel +Twists for a per-step duration, then a stop. This is open-loop — no +SLAM, no obstacle avoidance — but works in sim and on the bare-metal +robot without requiring the nav stack. Good fit for a VLM that decides +the next short step from a camera frame and re-plans every iteration. +""" + +import asyncio +import math +import queue +import threading +import time +from typing import Any + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field +import uvicorn + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Twist import Twist +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class CmdBridgeConfig(ModuleConfig): + port: int = 7782 + + +class TwistRequest(BaseModel): + linear: list[float] = Field(default=[0.0, 0.0, 0.0], min_length=3, max_length=3) + angular: list[float] = Field(default=[0.0, 0.0, 0.0], min_length=3, max_length=3) + duration: float = 0.5 # seconds to hold the command before stopping + + +class PathStep(BaseModel): + """One step of an open-loop path. + + `linear`/`angular` are the same Twist components as `/cmd_vel`. The step + runs for `duration` seconds, then the next step starts. If the model + prefers semantic steps, use `forward`/`left`/`degrees` and let the bridge + convert. 
+ """ + + linear: list[float] | None = None + angular: list[float] | None = None + # Semantic alternative: distances in meters, rotation in degrees, + # executed over `duration` seconds. Converted to Twist by dividing. + forward: float | None = None + left: float | None = None + degrees: float | None = None + duration: float = 1.0 + + +class PathRequest(BaseModel): + steps: list[PathStep] + + +class CmdBridgeModule(Module): + """Open-loop HTTP -> cmd_vel bridge.""" + + config: CmdBridgeConfig + cmd_vel: Out[Twist] + odom: In[PoseStamped] + + def __init__(self, **kwargs: Any) -> None: + super().__init__(**kwargs) + self._app = FastAPI() + self._app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["GET", "POST"], + allow_headers=["*"], + ) + self._latest_pose: PoseStamped | None = None + self._server: uvicorn.Server | None = None + self._thread: threading.Thread | None = None + # Serialize POSTs so /path doesn't interleave with /cmd_vel. + self._drive_lock = threading.Lock() + # Cooperative cancel for an in-flight /path when /stop arrives. + self._cancel_event = threading.Event() + self._setup_routes() + + def _setup_routes(self) -> None: + @self._app.get("/pose") + def pose() -> dict[str, Any]: + if self._latest_pose is None: + return {"status": "no pose yet"} + p = self._latest_pose + q = p.orientation + # Yaw from quaternion (assumes near-zero roll/pitch, fine for a quadruped). 
+ siny = 2.0 * (q.w * q.z + q.x * q.y) + cosy = 1.0 - 2.0 * (q.y * q.y + q.z * q.z) + theta = math.atan2(siny, cosy) + return { + "status": "ok", + "x": p.position.x, + "y": p.position.y, + "z": p.position.z, + "theta": theta, + "ts": p.ts, + } + + @self._app.post("/cmd_vel") + def cmd_vel(req: TwistRequest) -> dict[str, Any]: + with self._drive_lock: + self._cancel_event.clear() + self._publish_twist(req.linear, req.angular) + self._sleep_or_cancel(req.duration) + self._publish_twist([0, 0, 0], [0, 0, 0]) + return {"status": "ok"} + + @self._app.post("/path") + def path(req: PathRequest) -> dict[str, Any]: + with self._drive_lock: + self._cancel_event.clear() + executed = 0 + for step in req.steps: + if self._cancel_event.is_set(): + break + linear, angular = _step_to_twist(step) + self._publish_twist(linear, angular) + if self._sleep_or_cancel(step.duration): + break + executed += 1 + self._publish_twist([0, 0, 0], [0, 0, 0]) + return { + "status": "cancelled" if self._cancel_event.is_set() else "ok", + "executed": executed, + "total": len(req.steps), + } + + @self._app.post("/stop") + def stop() -> dict[str, str]: + self._cancel_event.set() + self._publish_twist([0, 0, 0], [0, 0, 0]) + return {"status": "stopped"} + + def _publish_twist(self, linear: list[float], angular: list[float]) -> None: + self.cmd_vel.publish( + Twist( + linear=Vector3(linear[0], linear[1], linear[2]), + angular=Vector3(angular[0], angular[1], angular[2]), + ts=time.time(), + ) + ) + + def _sleep_or_cancel(self, duration: float) -> bool: + """Sleep up to `duration`, returns True if cancelled mid-sleep.""" + return self._cancel_event.wait(timeout=max(0.0, duration)) + + def _on_odom(self, pose: PoseStamped) -> None: + self._latest_pose = pose + + @rpc + def start(self) -> None: + super().start() + self.register_disposable(self.odom.subscribe(self._on_odom)) + + def run() -> None: + config = uvicorn.Config( + self._app, + host="127.0.0.1", + port=self.config.port, + log_level="warning", + 
lifespan="on", + ) + self._server = uvicorn.Server(config) + try: + self._server.run() + except OSError as e: + logger.error( + f"cmd-bridge failed to bind :{self.config.port} ({e}); " + f"another instance? `lsof -ti :{self.config.port} | xargs kill -9`" + ) + except Exception: + logger.exception("cmd-bridge server crashed") + + self._thread = threading.Thread(target=run, daemon=True, name="cmd-bridge-uvicorn") + self._thread.start() + logger.info( + f"cmd-bridge-module: POST http://127.0.0.1:{self.config.port}/cmd_vel " + f"| POST /path | POST /stop | GET /pose" + ) + + @rpc + def stop(self) -> None: + self._cancel_event.set() + self._publish_twist([0, 0, 0], [0, 0, 0]) + if self._server is not None: + self._server.should_exit = True + super().stop() + + +def _step_to_twist(step: PathStep) -> tuple[list[float], list[float]]: + """Convert a PathStep to (linear, angular) Twist components.""" + if step.linear is not None or step.angular is not None: + return ( + step.linear or [0.0, 0.0, 0.0], + step.angular or [0.0, 0.0, 0.0], + ) + dur = max(step.duration, 1e-3) + fwd = (step.forward or 0.0) / dur + lat = (step.left or 0.0) / dur + yaw = math.radians(step.degrees or 0.0) / dur + return [fwd, lat, 0.0], [0.0, 0.0, yaw] diff --git a/dimos/web/mjpeg_module.py b/dimos/web/mjpeg_module.py new file mode 100644 index 0000000000..ccac9200f8 --- /dev/null +++ b/dimos/web/mjpeg_module.py @@ -0,0 +1,97 @@ +"""Expose color_image as an MJPEG HTTP stream + single-frame snapshot. 
class CameraMjpegModule(Module):
    """Expose the ``color_image`` input over HTTP.

    Every incoming frame is pushed onto an Rx subject that is handed to
    ``RobotWebInterface`` under the configured ``stream_key`` (the startup log
    line advertises it as ``/video_feed/<stream_key>``). The most recent frame
    is also kept JPEG-encoded so ``GET /snapshot/<stream_key>`` can return a
    single image without touching the stream.
    """

    config: CameraMjpegConfig
    color_image: In[Image]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        # Fan-out point between the dimos input stream and the web interface.
        self._subject: rx.subject.Subject = rx.subject.Subject()
        self._web: RobotWebInterface | None = None
        self._thread: threading.Thread | None = None
        # Latest JPEG-encoded frame; written by _on_image and read by the
        # snapshot route, hence the lock.
        self._latest_jpeg: bytes | None = None
        self._latest_lock = threading.Lock()

    @rpc
    def start(self) -> None:
        """Create the web interface, register the snapshot route and start serving."""
        super().start()
        self._web = RobotWebInterface(
            port=self.config.port,
            **{self.config.stream_key: self._subject.pipe(ops.share())},
        )
        # Open CORS so a page on another origin can read the frames.
        self._web.app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_methods=["GET"],
            allow_headers=["*"],
        )

        stream_key = self.config.stream_key

        @self._web.app.get(f"/snapshot/{stream_key}")
        def snapshot() -> Response:
            # Copy the reference under the lock; the bytes object is immutable.
            with self._latest_lock:
                buf = self._latest_jpeg
            if buf is None:
                raise HTTPException(status_code=503, detail="no frame yet")
            return Response(content=buf, media_type="image/jpeg")

        # The web server runs in a daemon thread so it never blocks shutdown.
        self._thread = threading.Thread(target=self._web.run, daemon=True)
        self._thread.start()

        self.color_image.subscribe(self._on_image)
        logger.info(
            f"MJPEG: http://127.0.0.1:{self.config.port}/video_feed/{stream_key} "
            f"| snapshot: http://127.0.0.1:{self.config.port}/snapshot/{stream_key}"
        )

    def _on_image(self, img: Image) -> None:
        """Forward one frame to the stream and refresh the cached snapshot."""
        arr = img.as_numpy()
        # OpenCV encodes in BGR order; 3-channel input is converted from RGB.
        if arr.ndim == 3 and arr.shape[2] == 3:
            arr = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        self._subject.on_next(arr)

        ok, jpg = cv2.imencode(".jpg", arr)
        if ok:
            with self._latest_lock:
                self._latest_jpeg = np.asarray(jpg).tobytes()

    @rpc
    def stop(self) -> None:
        """Shut the web interface down (best effort) and stop the module."""
        if self._web is not None:
            try:
                self._web.shutdown()
            except Exception:
                # Teardown stays best-effort so the module can always stop,
                # but a failure is logged instead of being silently dropped.
                logger.exception("MJPEG web interface shutdown failed")
        super().stop()
# ---- config ----------------------------------------------------------------
# Every setting can be overridden from the environment; $1 picks the blueprint.
ROBOT_IP="${ROBOT_IP:-192.168.12.1}"
EXPECTED_SSID="${EXPECTED_SSID:-dimairos05}"
BLUEPRINT="${1:-unitree-go2-basic}"
VENV_DIR="${VENV_DIR:-.venv}"
BRIDGES="${BRIDGES:-1}"
EXTRA="${EXTRA:-}"
SIMULATION="${SIMULATION:-0}"
LMSTUDIO="${LMSTUDIO:-0}"
MLXVLM="${MLXVLM:-0}"
LMSTUDIO_MODEL="${LMSTUDIO_MODEL:-qwen/qwen3-8b}"
MLXVLM_MODEL="${MLXVLM_MODEL:-lmstudio-community/gemma-4-E4B-it-MLX-4bit}"
# ----------------------------------------------------------------------------

# Coloured one-line status helpers.
c_red()   { printf '\033[31m%s\033[0m\n' "$*"; }
c_green() { printf '\033[32m%s\033[0m\n' "$*"; }
c_yellow(){ printf '\033[33m%s\033[0m\n' "$*"; }
c_bold()  { printf '\033[1m%s\033[0m\n' "$*"; }

# Sanity: only one LLM preset at a time.
if [[ "$LMSTUDIO" == "1" && "$MLXVLM" == "1" ]]; then
  c_red "✗ pick one: LMSTUDIO=1 or MLXVLM=1, not both"
  exit 1
fi

# Optional LLM endpoint override + model selection for the McpClient.
LLM_ARGS=()
LLM_NAME=""
LLM_URL=""
LLM_MODEL=""
if [[ "$LMSTUDIO" == "1" ]]; then
  LLM_NAME="LM Studio"
  LLM_URL="http://127.0.0.1:1234/v1"
  LLM_MODEL="$LMSTUDIO_MODEL"
elif [[ "$MLXVLM" == "1" ]]; then
  LLM_NAME="mlxvlm (Gemma 4)"
  LLM_URL="http://127.0.0.1:8080/v1"
  LLM_MODEL="$MLXVLM_MODEL"
fi

# When an LLM preset is set but the chosen blueprint has no McpClient,
# auto-inject mcp-server + mcp-client so the override actually binds.
# (Mirrors what sim-with-llm.sh does, so real-robot LLM mode behaves the
# same as sim.)
if [[ -n "$LLM_URL" && "$BLUEPRINT" != *"agentic"* ]]; then
  if [[ -z "$EXTRA" ]]; then
    EXTRA="mcp-server mcp-client"
  elif [[ "$EXTRA" != *"mcp-client"* ]]; then
    EXTRA="$EXTRA mcp-server mcp-client"
  fi
fi

# Assemble the module list once so it's reused for echo + exec.
MODULES=("$BLUEPRINT")
if [[ "$BRIDGES" == "1" ]]; then
  MODULES+=("camera-mjpeg-module" "audio-ws-module" "cmd-bridge-module")
fi
if [[ -n "$EXTRA" ]]; then
  # Word-splitting is intentional: EXTRA is a space-separated module list.
  # shellcheck disable=SC2206
  MODULES+=( $EXTRA )
fi

if [[ -n "$LLM_URL" ]]; then
  export OPENAI_BASE_URL="$LLM_URL"
  export OPENAI_API_KEY="${OPENAI_API_KEY:-local-llm}"  # placeholder; servers ignore it
  LLM_ARGS=( -o "mcpclient.model=openai:$LLM_MODEL" )
fi

c_bold "▶ Go2 hackathon quickstart"
echo "  modules  : ${MODULES[*]}"
if [[ "$SIMULATION" == "1" ]]; then
  echo "  mode     : simulation (no robot)"
else
  echo "  robot ip : $ROBOT_IP"
fi
if [[ -n "$LLM_NAME" ]]; then
  echo "  LLM      : $LLM_NAME → $LLM_URL"
  echo "  model    : $LLM_MODEL"
fi
echo

if [[ "$SIMULATION" != "1" ]]; then
  # 1. Wifi check (macOS) ----------------------------------------------------
  if [[ "$(uname)" == "Darwin" ]]; then
    SSID="$(ipconfig getsummary en0 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')"
    if [[ -z "$SSID" ]]; then
      SSID="$(ipconfig getsummary en1 2>/dev/null | awk -F' SSID : ' '/ SSID : /{print $2; exit}')"
    fi
    if [[ "$SSID" == "$EXPECTED_SSID" ]]; then
      c_green "✓ on wifi $SSID"
    else
      c_yellow "⚠ current wifi is '${SSID:-unknown}', expected '$EXPECTED_SSID'"
      c_yellow "  if the robot is on a different network, set ROBOT_IP and ignore this"
    fi
  else
    c_yellow "⚠ non-macOS host, skipping wifi check"
  fi

  # 2. Reachability ----------------------------------------------------------
  echo
  echo "→ pinging $ROBOT_IP …"
  # ping -W is milliseconds on macOS/BSD but seconds on Linux (iputils);
  # pick the unit so the per-reply wait is ~1 s on both.
  if [[ "$(uname)" == "Darwin" ]]; then
    PING_WAIT=1000
  else
    PING_WAIT=1
  fi
  if ping -c 3 -W "$PING_WAIT" "$ROBOT_IP" >/dev/null 2>&1; then
    c_green "✓ robot reachable"
  else
    c_red "✗ cannot reach $ROBOT_IP"
    c_red "  check: joined $EXPECTED_SSID? robot powered on? sport mode on in the Unitree app?"
    c_red "  (use SIMULATION=1 ./go2-start.sh to run MuJoCo instead)"
    exit 1
  fi

  # 3. Clock sync (Unitree video desyncs without this) -----------------------
  echo
  echo "→ syncing clock (sudo, may prompt for password) …"
  if command -v sntp >/dev/null 2>&1; then
    sudo sntp -sS pool.ntp.org >/dev/null 2>&1 \
      && c_green "✓ clock synced" \
      || c_yellow "⚠ clock sync failed (non-fatal, but video may lag lidar)"
  elif command -v ntpdate >/dev/null 2>&1; then
    sudo ntpdate pool.ntp.org >/dev/null 2>&1 \
      && c_green "✓ clock synced" \
      || c_yellow "⚠ clock sync failed (non-fatal)"
  else
    c_yellow "⚠ no sntp/ntpdate found, skipping clock sync"
  fi
fi

# 4. LLM endpoint reachability check ----------------------------------------
if [[ -n "$LLM_URL" ]]; then
  echo
  echo "→ probing $LLM_NAME at $LLM_URL/models …"
  if curl -fsS -m 3 "$LLM_URL/models" >/dev/null 2>&1; then
    c_green "✓ $LLM_NAME reachable"
  else
    c_red "✗ cannot reach $LLM_NAME at $LLM_URL"
    if [[ "$LMSTUDIO" == "1" ]]; then
      c_red "  start LM Studio's Local Server (Cmd-Shift-2) and load a tool-capable model"
    else
      c_red "  start mlxvlm: cd /Users/tex/repos/ai/mlx/mlxvlm && scripts/start-all.sh"
    fi
    exit 1
  fi
fi

# 5. Venv --------------------------------------------------------------------
echo
if [[ -d "$VENV_DIR" ]]; then
  c_green "✓ using existing venv $VENV_DIR"
else
  c_yellow "no venv at $VENV_DIR — creating one"
  if ! command -v uv >/dev/null 2>&1; then
    c_red "✗ uv is not installed. install: brew install uv  (or curl -LsSf https://astral.sh/uv/install.sh | sh)"
    exit 1
  fi
  uv venv --python 3.12 "$VENV_DIR"
  # Activate before installing so uv targets this venv even for a custom VENV_DIR.
  # shellcheck disable=SC1091
  source "$VENV_DIR/bin/activate"
  if [[ "$SIMULATION" == "1" ]]; then
    uv pip install 'dimos[base,unitree,sim]'
  else
    uv pip install 'dimos[base,unitree]'
  fi
fi
# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

# 6. Env vars ----------------------------------------------------------------
export ROBOT_IP
# Agentic-gemini blueprint needs these; warn if missing for that path
if [[ "$BLUEPRINT" == *"gemini"* && -z "${GOOGLE_API_KEY:-}" ]]; then
  c_yellow "⚠ GOOGLE_API_KEY not set — gemini blueprint will fail when the agent runs"
fi
if [[ -z "$LLM_URL" && "$BLUEPRINT" == *"agentic"* && "$BLUEPRINT" != *"ollama"* && "$BLUEPRINT" != *"gemini"* ]]; then
  if [[ -z "${OPENAI_API_KEY:-}" ]]; then
    c_yellow "⚠ OPENAI_API_KEY not set — agentic blueprint will fail when the agent runs"
    c_yellow "  (or set LMSTUDIO=1 / MLXVLM=1 to use a local LLM)"
  fi
fi

# 7. Launch ------------------------------------------------------------------
echo
c_bold "▶ endpoints"
echo "  command center : http://localhost:7779"
if [[ "$BRIDGES" == "1" ]]; then
  echo "  camera MJPEG   : http://127.0.0.1:7780/video_feed/color_image"
  echo "  camera snapshot: http://127.0.0.1:7780/snapshot/color_image"
  if [[ "$SIMULATION" == "1" ]]; then
    echo "  audio          : (robot only — sim has no audio)"
  else
    echo "  audio out (ws) : ws://127.0.0.1:7781/audio_out"
    echo "  audio info     : http://127.0.0.1:7781/audio_info"
    echo "  audio play     : POST http://127.0.0.1:7781/play"
  fi
  echo "  cmd_vel        : POST http://127.0.0.1:7782/cmd_vel"
  echo "  path           : POST http://127.0.0.1:7782/path"
  echo "  stop           : POST http://127.0.0.1:7782/stop"
  echo "  pose           : http://127.0.0.1:7782/pose"
fi
echo
c_bold "▶ launching: dimos $([[ "$SIMULATION" == "1" ]] && echo "--simulation ")run ${MODULES[*]} ${LLM_ARGS[*]:-}"
echo "  ctrl-c to stop"
echo

# LLM_ARGS is empty unless a preset is active. Under `set -u`, bash < 4.4
# (including the 3.2 that ships with macOS) treats "${arr[@]}" on an empty
# array as an unbound variable, so expand it only when it has elements.
if [[ "$SIMULATION" == "1" ]]; then
  exec dimos --simulation run "${MODULES[@]}" ${LLM_ARGS[@]+"${LLM_ARGS[@]}"}
else
  exec dimos run "${MODULES[@]}" ${LLM_ARGS[@]+"${LLM_ARGS[@]}"}
fi
2026-05-27 — camera + audio web bridges, Go2 audio scaffold + +Branch: `feat/gemini-go2-2245` + +## Goal +Stream the Go2 (sim or real) camera into a web page consumable by mlxvlm, +then add the same shape for robot audio (mic out, speaker in). + +## Camera + +### `dimos/web/mjpeg_module.py` (new) — `camera-mjpeg-module` +- Module subscribes to `color_image: In[Image]` and republishes via FastAPI on + `:7780`. +- Endpoints: + - `GET /video_feed/color_image` — MJPEG (`multipart/x-mixed-replace`) + - `GET /snapshot/color_image` — single JPEG (for backend pull, e.g. mlxvlm + `/api/analyze` server-side) +- CORS allow-all so a browser at a different origin can `drawImage` onto a + canvas without tainting. +- Default port moved from 5555 → 7780 to avoid common collisions. + +### `dimos/robot/all_blueprints.py` +- Registered `"camera-mjpeg-module": "dimos.web.mjpeg_module.CameraMjpegModule"`. + +### Use +``` +dimos --simulation run unitree-go2-basic camera-mjpeg-module +# http://127.0.0.1:7780/video_feed/color_image +# http://127.0.0.1:7780/snapshot/color_image +``` +Same URLs work against the real Go2 — drop `--simulation`, set `ROBOT_IP`. + +## Audio (real-robot only — sim has no audio) + +### `dimos/robot/unitree/connection.py` (modified) +- Added `AudioMessage(data: bytes, sample_rate: int, channels: int)` dataclass. +- `UnitreeWebRTCConnection.audio_stream()` — subscribes to + `LegionConnection.audio` via `add_track_callback`, calls + `switchAudioChannel(True)`, emits `AudioMessage` (int16 PCM at the frame's + native rate, usually 48 kHz mono). +- `UnitreeWebRTCConnection.play_wav_bytes(wav: bytes)` — writes a tempfile, + uploads via `WebRTCAudioHub.upload_megaphone`, then `enter_megaphone`. Fires + on the connection's asyncio loop; returns immediately. + +### `dimos/robot/unitree/go2/connection.py` (modified) +- New streams on `GO2Connection`: + - `audio: Out[bytes]` — mic data (currently only the PCM bytes — see + review issue #1, metadata is dropped). 
+ - `audio_in: In[bytes]` — WAV bytes to play through the speaker. +- `start()` wires both if the connection supports them (`hasattr` guard for + sim). +- New skill `play_wav(wav_path: str)` — agent-facing, reads local WAV and + invokes `play_wav_bytes`. + +### `dimos/web/audio_ws_module.py` (new) — `audio-ws-module` +- Module with `audio: In[bytes]` (mic) and `audio_in: Out[bytes]` (speaker). +- FastAPI on `:7781`: + - `WebSocket /audio_out` — pushes binary PCM frames to connected clients. + - `POST /play` — body = WAV bytes; publishes to `audio_in` for the robot + speaker. +- CORS open on the HTTP routes. + +### `dimos/robot/all_blueprints.py` +- Registered `"audio-ws-module": "dimos.web.audio_ws_module.AudioWsModule"`. + +### Use (real Go2 only) +``` +dimos run unitree-go2-basic audio-ws-module +# ws://127.0.0.1:7781/audio_out (mic out, binary) +# curl --data-binary @clip.wav http://127.0.0.1:7781/play +``` + +## Other changes + +### `go2-start.sh` +- Quickstart script for the dimairos05 Go2 — wifi check, ping, NTP sync, venv + bootstrap, env-var sanity, `exec dimos run <modules…>`. Useful at the + hackathon table. + +### Editable install +- Switched the venv from a one-shot `uv pip install 'dimos[sim] @ .'` to + `uv pip install -e '.[sim]'` so source edits apply without reinstall. + +## Review fixes applied +- `GO2Connection.audio` retyped `Out[AudioMessage]`; rate + channels carry + through to `audio-ws-module`, which now exposes them via `GET /audio_info` and + sends an initial `{"event":"format", ...}` JSON frame on WS connect. +- `play_wav_bytes` reads WAV duration via `wave`, sleeps for it, then calls + `exit_megaphone` so the robot doesn't stay in megaphone mode. +- Float frames from aiortc are now scaled `(arr * 32767).clip(...)` before + the int16 cast instead of being silently truncated. +- Added `play_wav_b64` skill alongside `play_wav` for remote MCP clients + that don't share the dimos host's filesystem.
+- Moved `os` / `tempfile` / `wave` imports to module top. + +## Known issues / follow-ups +1. Stereo packed-vs-planar layout — `arr.shape[0]` assumes planar; Go2 mic + is mono so OK in practice. Document, don't fix yet. +2. `audio-ws-module` alone (without `unitree-go2-basic`) has an unbound `In` — + autoconnect doesn't warn loudly enough. +3. Untested: no real Go2 available during this session. Sim has no audio. + +## Mac + local-LLM agentic — what works, what doesn't + +Goal: drive an MCP agent on Apple Silicon with LM Studio (:1234) or the +mlxvlm Gemma-4 server (:8080), no CUDA, no Google services. + +### Blueprints + +| Blueprint | Boots on Mac? | Notes | +|---|---|---| +| `unitree-go2-agentic` | ✗ | imports SecurityModule (EdgeTAM/CUDA) and Moondream VL (crashes Metal) | +| `unitree-go2-agentic-gemini` | ✗ as-is | imports `GeminiSpeakSkill` → `from google import genai` at module load. Run `uv pip install google-genai` first, even if you don't call any Gemini APIs. | +| `unitree-go2-agentic-ollama` | ✓ | clean compose; `./sim-with-llm.sh ollama` uses it | +| `unitree-go2-basic + mcp-server + mcp-client` | ✓ | minimal but reliable; default for `./sim-with-llm.sh lmstudio` and `mlxvlm` | + +### Skill availability by compose + +| Compose | Skills the agent gets | +|---|---| +| `unitree-go2-basic + mcp-server + mcp-client` | `observe`, `play_wav`, `play_wav_b64` (last two no-op in sim) | +| `… + unitree-skill-container + replanning-a-star-planner` | adds `relative_move`, `wait`, `current_time`, `tilt_body`, `execute_sport_command` — heavier startup (full nav stack) | + +`unitree-skill-container` alone fails build: +`"No module met NavigationInterfaceSpec spec"`. The implementor is +`replanning-a-star-planner`. + +### VL backends — Mac gotcha + +`dimos/models/vl/types.py:17` literal: `qwen | moondream | gemini`. + +- `qwen` — **not local**: hits Alibaba DashScope cloud (`dashscope-intl.aliyuncs.com`), needs `ALIBABA_API_KEY`. 
+- `moondream` — local; crashes Metal on Apple Silicon (per the `-gemini` blueprint docstring). +- `gemini` — Google cloud, needs `GOOGLE_API_KEY`. + +There is currently **no Mac-local VL backend**. The user has mlxvlm on :8080 +(Gemma-4-E4B + Falcon-Perception, OpenAI-compatible) — natural place to plug +a new `mlxvlm` / `openai_compat` backend. Not implemented yet; needs a +new file under `dimos/models/vl/`. + +### LLM chat backends + +- **LM Studio at :1234** and **mlxvlm at :8080** both speak `/v1/chat/completions` + with proper OpenAI tool-call passthrough — `langchain-openai` (already + installed) handles them via `OPENAI_BASE_URL`. No new dimos module + needed for chat. +- McpClient model override via `-o mcpclient.model=openai:<model>` works. + Wired up in `go2-start.sh` LMSTUDIO/MLXVLM presets. + +### Next iteration +Add `mlxvlm` to `VlModelName` so `NavigationSkillContainer` and +`PersonFollowSkillContainer` can run with a local VL backend. With that and +`google-genai` installed, `unitree-go2-agentic-gemini` becomes the +richest Mac agentic compose if we also `--disable` the Gemini-runtime +modules (`gemini-speak-skill`, `map-uploader`, `spatial-memory`). + +## Composition cheatsheet +| Run | Endpoints | +|---|---| +| `unitree-go2-basic camera-mjpeg-module` | `:7780/video_feed/color_image`, `:7780/snapshot/color_image` | +| `unitree-go2-basic audio-ws-module` | `ws://:7781/audio_out`, `POST :7781/play` | +| `unitree-go2-basic camera-mjpeg-module audio-ws-module` | both | +| `unitree-go2-agentic camera-mjpeg-module audio-ws-module` | MCP + agent + both bridges | diff --git a/journal/2026-05-27-mlxvlm-robot-integration-prompt.md b/journal/2026-05-27-mlxvlm-robot-integration-prompt.md new file mode 100644 index 0000000000..e5b0c343c1 --- /dev/null +++ b/journal/2026-05-27-mlxvlm-robot-integration-prompt.md @@ -0,0 +1,123 @@ +# mlxvlm → dimos: drive the Go2 from a VLM + +Use this as the prompt for the Claude working on `/Users/tex/repos/ai/mlx/mlxvlm`.
+ +## Context + +dimos is the robot OS. It runs as a separate process on the same Mac and +exposes three OpenAI/HTTP-shaped bridges on localhost so mlxvlm can plug in +without speaking any internal dimos APIs: + +| URL | Direction | Format | +|---|---|---| +| `GET http://127.0.0.1:7780/video_feed/color_image` | dimos → you | MJPEG (`multipart/x-mixed-replace; boundary=frame`) | +| `GET http://127.0.0.1:7780/snapshot/color_image` | dimos → you | one JPEG | +| `GET ws://127.0.0.1:7781/audio_out` | dimos → you | binary int16 PCM frames; first WS message is `{"event":"format","sample_rate":N,"channels":N}` | +| `GET http://127.0.0.1:7781/audio_info` | dimos → you | `{sample_rate,channels}` JSON | +| `POST http://127.0.0.1:7781/play` | you → dimos | body = raw WAV bytes; plays on robot speaker (no-op in sim) | +| `POST http://127.0.0.1:7782/cmd_vel` | you → dimos | one Twist step | +| `POST http://127.0.0.1:7782/path` | you → dimos | sequence of Twist steps, executed in order | +| `POST http://127.0.0.1:7782/stop` | you → dimos | zero-Twist emergency stop | +| `GET http://127.0.0.1:7782/pose` | dimos → you | `{x, y, z, theta, ts}` of base_link in world frame | + +All endpoints are CORS-open. Endpoints stay the same whether dimos is in +MuJoCo sim or driving a real Go2 — only audio is robot-only. + +## Twist + Path semantics + +`cmd_vel` is a standard ROS-style Twist: linear `[x_forward, y_left, z]` in +m/s, angular `[x_roll, y_pitch, z_yaw]` in rad/s. Open-loop: the bridge +publishes the Twist, sleeps for `duration` seconds, then publishes zero. + +`/path` takes a list of steps. Two ways to spell a step: + +1. **Raw Twist** — the model emits velocity components: + ```json + {"linear":[0.3,0,0],"angular":[0,0,0],"duration":1.0} + ``` + +2. 
**Semantic** — the model emits distances in m / rotation in deg, and the + bridge divides by `duration` to make a Twist: + ```json + {"forward":0.5,"left":0,"degrees":0,"duration":2.0} + {"forward":0,"left":0,"degrees":-90,"duration":1.5} + ``` + +Steps run sequentially. `/stop` cancels an in-flight `/path` cooperatively. + +No SLAM, no obstacle avoidance — for closed-loop, **re-plan** every iteration +from a fresh camera frame instead of sending long paths. + +## What to implement in mlxvlm + +Add a `POST /api/robot/navigate` route to `app.py` that runs a perceive-act +loop until the goal is reached, the user cancels, or `max_steps` is hit: + +``` +input: { "goal": "find the red chair and stop in front of it", + "max_steps": 12, + "step_seconds": 1.0 } + +loop: + 1. frame = httpx.get(DIMOS_CAMERA_URL).content + 2. pose = httpx.get(DIMOS_POSE_URL).json() + 3. ask Gemma-4-Falcon-Perception via the existing /api/analyze pipeline: + "Camera frame attached. Robot pose: {pose}. Goal: {goal}. + Decide the next ≤2 second movement. Reply ONLY with strict JSON: + { 'action':'move'|'stop'|'arrived', + 'forward': float meters, + 'left': float meters, + 'degrees': float rotation, + 'reason': '<one short sentence>' } + Positive forward=ahead, positive left=left, positive degrees=CCW. + Keep |forward|≤1.0, |left|≤0.5, |degrees|≤45 per step." + 4. parse JSON; if action=='arrived' or 'stop', POST DIMOS_STOP_URL and break + 5. POST DIMOS_PATH_URL a single PathStep with the chosen + values + duration=step_seconds, wait for response + 6.
SSE-stream {step, action, reason, pose} so the UI shows progress + +env config (read at startup): + DIMOS_CAMERA_URL = http://127.0.0.1:7780/snapshot/color_image + DIMOS_POSE_URL = http://127.0.0.1:7782/pose + DIMOS_PATH_URL = http://127.0.0.1:7782/path + DIMOS_STOP_URL = http://127.0.0.1:7782/stop + DIMOS_PLAY_URL = http://127.0.0.1:7781/play (optional, for spoken feedback) +``` + +Failure modes the loop must handle: +- camera/pose endpoint 5xx or timeout (skip the step, log, retry once) +- JSON parse failure from the model (treat as `stop`) +- per-step Twist limits exceeded by the model (clamp, log) +- max_steps hit without `arrived` (return `{status: "exhausted", history: […]}`) + +UI (optional, in `static/`): +- "Goal" text input + "Go" button → POST /api/robot/navigate +- live event stream → render the latest camera frame, current pose, last + reason, step counter +- big red "Stop" button → POST `DIMOS_STOP_URL` directly (don't wait for + the agent) + +## Audio loop (optional follow-up) +If you want the robot to speak / listen: +- Mic: open `ws://127.0.0.1:7781/audio_out`, parse the first JSON + `format` frame, buffer ~1s of PCM, wrap in a WAV header, POST to the + existing mlx-audio whisper server on `:8000`. +- Speaker: TTS output → POST raw WAV to `http://127.0.0.1:7781/play`. + +## Things NOT to do +- Don't try to import anything from dimos. All integration is over HTTP/WS. +- Don't long-poll `/path` — it blocks until the steps finish. Use shorter + paths (1–3 steps) per loop iteration so the VLM can re-evaluate. +- Don't assume metric scale from the camera — the VLM reasons about + "ahead", "left", "approaching", not centimeters. Conservative step sizes + (0.3–0.6 m forward, ≤30° turns) are safer. + +## Testing without a real robot +The dimos sim publishes the same endpoints. 
Start it with: +``` +cd ~/repos/robotics/dimos +./sim-with-llm.sh mlxvlm # routes the in-sim agent's LLM to your :8080 +``` +Then in another terminal, your mlxvlm `POST /api/robot/navigate` should +drive the sim Go2 in the MuJoCo viewer, and you'll see its 3D position +change via `GET /pose`. diff --git a/pyproject.toml b/pyproject.toml index 7bc41d2cd3..ef6e6126e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -201,6 +201,8 @@ agents = [ "langchain-text-splitters>=1,<2", "langchain-huggingface>=1,<2", "langchain-ollama>=1,<2", + "langchain-google-genai>=2,<3", + "google-genai>=2,<3", "ollama>=0.6.0", "anthropic>=0.19.0", diff --git a/scripts/export_recording.py b/scripts/export_recording.py new file mode 100644 index 0000000000..7ab2bf1876 --- /dev/null +++ b/scripts/export_recording.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. + +"""Export a memory2 recording into the robomoo web app. + +Reads a SqliteStore `.db` and pushes three structured artifacts over the same +token-guarded robot-ingest endpoints the live skills use (`map_uploader.py`, +`take_picture_skill.py`): + + 1. occupancy map — top-down lidar histogram → value-encoded grayscale PNG + → POST /api/robot/map + 2. trajectory — downsampled odom path [{ts,x,y,theta}] (JSON) + → POST /api/robot/trajectory + 3. embedded frames — throttled color_image keyframes, each CLIP-embedded + (same model the web searches with) + thumbnail + pose + → POST /api/robot/frame (with an `embedding` field) + +The web then renders an interactive vector map (pan/zoom), the driven path, and +in-browser CLIP semantic search over the frames. + +Usage: + ROBOMOO_URL=http://localhost:4470 ROBOT_INGEST_TOKEN=... 
def quat_to_yaw(qx: float, qy: float, qz: float, qw: float) -> float:
    """Return the yaw angle (rotation about +z), in radians, of a quaternion.

    The quaternion is given in ``(x, y, z, w)`` order.
    """
    sin_yaw = 2.0 * (qw * qz + qx * qy)
    cos_yaw = 1.0 - 2.0 * (qy * qy + qz * qz)
    return math.atan2(sin_yaw, cos_yaw)


def quat_to_matrix(qx: float, qy: float, qz: float, qw: float) -> np.ndarray:
    """Return the 3x3 float64 rotation matrix of an ``(x, y, z, w)`` quaternion.

    The quaternion is used as given; it is not normalised here.
    """
    # Pairwise products, named so each matrix entry reads like the formula.
    xx, yy, zz = qx * qx, qy * qy, qz * qz
    xy, xz, yz = qx * qy, qx * qz, qy * qz
    wx, wy, wz = qx * qw, qy * qw, qz * qw

    row_x = [1 - 2 * (yy + zz), 2 * (xy - wz), 2 * (xz + wy)]
    row_y = [2 * (xy + wz), 1 - 2 * (xx + zz), 2 * (yz - wx)]
    row_z = [2 * (xz - wy), 2 * (yz + wx), 1 - 2 * (xx + yy)]
    return np.array([row_x, row_y, row_z], dtype=np.float64)


def post(url: str, token: str, path: str, **kwargs) -> None:
    """POST to ``url + path`` with a bearer token; raise on a non-2xx reply.

    Extra keyword arguments (``files=``, ``data=``, ...) are forwarded to
    ``httpx.post`` unchanged.
    """
    endpoint = f"{url.rstrip('/')}{path}"
    auth_header = {"Authorization": f"Bearer {token}"}
    response = httpx.post(endpoint, headers=auth_header, timeout=60.0, **kwargs)
    response.raise_for_status()
def push_frames(
    store: SqliteStore, url: str, token: str, interval: float, max_frames: int, thumb_w: int
) -> None:
    """Upload throttled, CLIP-embedded keyframes from the ``color_image`` stream.

    Each keyframe is embedded, downscaled to at most ``thumb_w`` pixels wide,
    JPEG-encoded and POSTed to ``/api/robot/frame`` together with its
    embedding and, when the observation has one, its x/y pose.

    Args:
        store: Opened recording to read from.
        url: Base URL of the ingest server.
        token: Bearer token for the ingest endpoints.
        interval: Throttle interval passed to ``throttle`` (seconds between
            keyframes, per the CLI help).
        max_frames: Stop after this many frames have been uploaded.
        thumb_w: Maximum thumbnail width in pixels; narrower frames are sent
            at their native size.
    """
    if "color_image" not in store.list_streams():
        logger.warning("no color_image stream — skipping frames")
        return
    # Imported lazily so the map/trajectory paths don't pay for the CLIP import.
    from dimos.models.embedding.clip import CLIPModel

    embedder = CLIPModel()
    uploaded = 0
    for obs in store.streams.color_image.transform(throttle(interval)):
        if uploaded >= max_frames:
            break

        image = obs.data
        embedding = embedder.embed(image)
        vector = embedding.to_numpy().astype(float).ravel().tolist()

        thumb = image.to_opencv()
        src_h, src_w = thumb.shape[:2]
        if src_w > thumb_w:
            # Keep the aspect ratio while shrinking to the target width.
            scaled_h = int(src_h * thumb_w / src_w)
            thumb = cv2.resize(thumb, (thumb_w, scaled_h))
        encoded, jpeg = cv2.imencode(".jpg", thumb, [cv2.IMWRITE_JPEG_QUALITY, 80])
        if not encoded:
            # A frame that fails to encode is skipped and not counted.
            continue

        form = {"label": "frame", "embedding": json.dumps(vector)}
        if obs.pose is not None:
            form["poseX"] = str(obs.pose[0])
            form["poseY"] = str(obs.pose[1])
        post(
            url,
            token,
            "/api/robot/frame",
            files={"file": ("frame.jpg", jpeg.tobytes(), "image/jpeg")},
            data=form,
        )
        uploaded += 1
    logger.info("pushed %d embedded frames", uploaded)
+ + if not args.robomoo_url or not args.token: + raise SystemExit("set ROBOMOO_URL and ROBOT_INGEST_TOKEN (or pass --robomoo-url/--token)") + + store = SqliteStore(path=args.db) + logger.info("opened %s — streams: %s", args.db, store.list_streams()) + + if not args.no_map: + build_and_push_map(store, args.robomoo_url, args.token, args.map_resolution, args.lidar_every) + if not args.no_trajectory: + push_trajectory(store, args.robomoo_url, args.token, args.traj_interval) + if not args.no_frames: + push_frames( + store, + args.robomoo_url, + args.token, + args.frame_interval, + args.max_frames, + args.thumb_width, + ) + logger.info("done") + + +if __name__ == "__main__": + main() diff --git a/sim-with-llm.sh b/sim-with-llm.sh new file mode 100755 index 0000000000..0f30e3394e --- /dev/null +++ b/sim-with-llm.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# sim-with-llm.sh — sim + local LLM, one command. +# +# Thin wrapper around go2-start.sh: sets SIMULATION=1 and the backend preset. +# go2-start.sh handles the rest (auto-injects mcp-server/mcp-client when an +# LLM preset is set against a non-agentic blueprint). +# +# Usage: +# ./sim-with-llm.sh # mlxvlm Gemma-4 (default chat LLM) +# ./sim-with-llm.sh lmstudio # LM Studio +# ./sim-with-llm.sh mlxvlm qwen3 # mlxvlm with a specific loaded model +# ./sim-with-llm.sh ollama # local ollama (uses native ollama blueprint) +# +# Override the base blueprint if you want a different shape: +# BLUEPRINT=unitree-go2-basic ./sim-with-llm.sh lmstudio # default +# BLUEPRINT=unitree-go2-agentic-ollama ./sim-with-llm.sh # full agentic compose + +set -euo pipefail + +BACKEND="${1:-mlxvlm}" +shift || true + +BLUEPRINT="${BLUEPRINT:-unitree-go2-basic}" + +case "$BACKEND" in + mlxvlm) + export MLXVLM=1 + if [[ -n "${1:-}" ]]; then export MLXVLM_MODEL="$1"; fi + ;; + lmstudio) + export LMSTUDIO=1 + if [[ -n "${1:-}" ]]; then export LMSTUDIO_MODEL="$1"; fi + ;; + ollama) + # The ollama agentic blueprint already composes mcp-server/mcp-client. 
#!/usr/bin/env bash
# sim-with-llm.sh — sim + local LLM, one command.
#
# Thin wrapper around go2-start.sh: sets SIMULATION=1 and the backend preset.
# go2-start.sh handles the rest (auto-injects mcp-server/mcp-client when an
# LLM preset is set against a non-agentic blueprint).
#
# Usage:
#   ./sim-with-llm.sh                 # mlxvlm Gemma-4 (default chat LLM)
#   ./sim-with-llm.sh lmstudio        # LM Studio
#   ./sim-with-llm.sh mlxvlm qwen3    # mlxvlm with a specific loaded model
#   ./sim-with-llm.sh ollama          # local ollama (uses native ollama blueprint)
#
# Override the base blueprint if you want a different shape:
#   BLUEPRINT=unitree-go2-basic ./sim-with-llm.sh lmstudio    # default
#   BLUEPRINT=unitree-go2-agentic-ollama ./sim-with-llm.sh    # full agentic compose
#
# Arguments:
#   $1  backend preset: mlxvlm (default) | lmstudio | ollama
#   $2  optional model name (mlxvlm and lmstudio only)
# Exit status:
#   2 on an unknown backend; otherwise whatever go2-start.sh returns (exec).

set -euo pipefail

# Locate go2-start.sh next to this script so the wrapper also works when it is
# invoked from another directory. Fall back to the original cwd-relative path
# if it is not found there, which keeps earlier invocations working unchanged.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
LAUNCHER="$SCRIPT_DIR/go2-start.sh"
if [[ ! -x "$LAUNCHER" ]]; then
    LAUNCHER="./go2-start.sh"
fi

BACKEND="${1:-mlxvlm}"
# With no arguments `shift` fails; tolerate that under `set -e`.
shift || true

BLUEPRINT="${BLUEPRINT:-unitree-go2-basic}"

case "$BACKEND" in
    mlxvlm)
        export MLXVLM=1
        if [[ -n "${1:-}" ]]; then export MLXVLM_MODEL="$1"; fi
        ;;
    lmstudio)
        export LMSTUDIO=1
        if [[ -n "${1:-}" ]]; then export LMSTUDIO_MODEL="$1"; fi
        ;;
    ollama)
        # The ollama agentic blueprint already composes mcp-server/mcp-client.
        # Note: this replaces any BLUEPRINT value taken from the environment.
        BLUEPRINT="unitree-go2-agentic-ollama"
        ;;
    *)
        echo "unknown backend: $BACKEND (use mlxvlm | lmstudio | ollama)" >&2
        exit 2
        ;;
esac

exec env SIMULATION=1 "$LAUNCHER" "$BLUEPRINT"
name = "faster-whisper" }, { name = "ffmpeg-python" }, { name = "filterpy" }, + { name = "google-genai" }, { name = "gtsam-extended" }, { name = "hydra-core" }, { name = "jinja2" }, { name = "langchain" }, { name = "langchain-chroma" }, { name = "langchain-core" }, + { name = "langchain-google-genai" }, { name = "langchain-huggingface" }, { name = "langchain-ollama" }, { name = "langchain-openai" }, @@ -2384,6 +2394,7 @@ requires-dist = [ { name = "ffmpeg-python", marker = "extra == 'web'" }, { name = "filterpy", marker = "extra == 'perception'", specifier = ">=1.4.5" }, { name = "gdown", marker = "extra == 'misc'", specifier = ">=5.2.2" }, + { name = "google-genai", marker = "extra == 'agents'", specifier = ">=2,<3" }, { name = "googlemaps", marker = "extra == 'misc'", specifier = ">=4.10.0" }, { name = "gtsam-extended", marker = "extra == 'mapping'", specifier = ">=4.3a1.post1" }, { name = "hydra-core", marker = "extra == 'perception'", specifier = ">=1.3.0" }, @@ -2393,6 +2404,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = ">=1.2.3,<2" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-core", marker = "extra == 'agents'", specifier = ">=1.2.22,<2" }, + { name = "langchain-google-genai", marker = "extra == 'agents'", specifier = ">=2,<3" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -3142,6 +3154,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/10/da216e25ef2f3c9dfa75574aa27f5f4c7e5fb5540308f04e4d8c4d834ecb/filelock-3.23.0-py3-none-any.whl", hash = "sha256:4203c3f43983c7c95e4bbb68786f184f6acb7300899bf99d686bb82d526bdf62", size = 22227, upload-time = "2026-02-14T02:53:56.122Z" }, ] +[[package]] +name = "filetype" +version = "1.2.0" +source = 
{ registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, +] + [[package]] name = "filterpy" version = "1.4.5" @@ -3449,6 +3470,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/9b/4366ad3e1c0688146c70aa6143584d6a8d88583b9390f106250e25a3d5cd/glfw-2.10.0-py2.py3-none-win_amd64.whl", hash = "sha256:7f787ee8645781f10e8800438ce4357ab38c573ffb191aba380c1e72eba6311c", size = 559423, upload-time = "2026-03-10T17:21:34.766Z" }, ] +[[package]] +name = "google-ai-generativelanguage" +version = "0.11.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core", extra = ["grpc"] }, + { name = "google-auth" }, + { name = "grpcio" }, + { name = "proto-plus" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/36/9365ca196c4753d0d22951ee0147ecb2124382a010d5dbfb9f3ecd57f2cf/google_ai_generativelanguage-0.11.0.tar.gz", hash = "sha256:d9e24e9836e894a85b52ca03d03530988aeb492d48df71cd1573dc1c3b6d81fc", size = 1539696, upload-time = "2026-03-30T22:51:43.143Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/8c/dc1faf90d7a6ec77e09a6706abefabb41ae5b8d838e5e5b16914da26ee1e/google_ai_generativelanguage-0.11.0-py3-none-any.whl", hash = "sha256:f797f307f0969ae49622e09c1d1a23aa86c7538ffae881279506548166e91d45", size = 1432183, upload-time = "2026-03-30T22:47:56.02Z" }, +] + +[[package]] +name = 
"google-api-core" +version = "2.30.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" }, +] + +[package.optional-dependencies] +grpc = [ + { name = "grpcio" }, + { name = "grpcio-status" }, +] + +[[package]] +name = "google-auth" +version = "2.53.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "pyasn1-modules" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/ad/ff781329bbbdc0974a098d996e89c9e1f7024262f9e3eec442fbb9ad1ac6/google_auth-2.53.0.tar.gz", hash = "sha256:e7e6aa16f6bee7b2b264830fd04f08087a1d5a836df516251a5d15327b246c9c", size = 335844, upload-time = "2026-05-15T20:53:07.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/c9/db44165ba7c581268c6d46017ef63339110378305062830104fc7fa144cb/google_auth-2.53.0-py3-none-any.whl", hash = "sha256:6e7449917c599b35126a99ec268ec6880301f2fea41dce198fe8fd83ff642b68", size = 246071, upload-time = "2026-05-15T20:53:05.609Z" }, +] + +[package.optional-dependencies] +requests = [ + { name = "requests" }, +] + [[package]] name = "google-crc32c" version = "1.8.0" @@ -3484,6 +3561,27 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9c/97/7d75fe37a7a6ed171a2cf17117177e7aab7e6e0d115858741b41e9dd4254/google_crc32c-1.8.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c", size = 28800, upload-time = "2025-12-16T00:40:30.322Z" }, ] +[[package]] +name = "google-genai" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "google-auth", extra = ["requests"] }, + { name = "httpx" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "sniffio" }, + { name = "tenacity" }, + { name = "typing-extensions" }, + { name = "websockets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/91/15fff1b7c22dc0b5b44fa348f151379721353d06a3da22dbbe15bf2c4dd9/google_genai-2.4.0.tar.gz", hash = "sha256:3f8d4bd618be2801e805dc698726731a70f34b438ea25a6c92800eef5b1f513e", size = 552025, upload-time = "2026-05-18T00:25:14.556Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ed/c8/1b49f6bb69dc7e291ba5d14926937bec4dffcc09ee875822636bd5ef3cf4/google_genai-2.4.0-py3-none-any.whl", hash = "sha256:48df1c44190b05b834fee9cffd360af6d6fd7f68164588e7ebd670fa34f71ee1", size = 818207, upload-time = "2026-05-18T00:25:12.426Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.72.0" @@ -3566,6 +3664,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/b2/b096ccce418882fbfda4f7496f9357aaa9a5af1896a9a7f60d9f2b275a06/grpcio-1.78.0-cp314-cp314-win_amd64.whl", hash = "sha256:dce09d6116df20a96acfdbf85e4866258c3758180e8c49845d6ba8248b6d0bbb", size = 4929852, upload-time = "2026-02-06T09:56:45.885Z" }, ] +[[package]] +name = "grpcio-status" +version = "1.78.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/8a/cd/89ce482a931b543b92cdd9b2888805518c4620e0094409acb8c81dd4610a/grpcio_status-1.78.0.tar.gz", hash = "sha256:a34cfd28101bfea84b5aa0f936b4b423019e9213882907166af6b3bddc59e189", size = 13808, upload-time = "2026-02-06T10:01:48.034Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/8a/1241ec22c41028bddd4a052ae9369267b4475265ad0ce7140974548dc3fa/grpcio_status-1.78.0-py3-none-any.whl", hash = "sha256:b492b693d4bf27b47a6c32590701724f1d3b9444b36491878fb71f6208857f34", size = 14523, upload-time = "2026-02-06T10:01:32.584Z" }, +] + [[package]] name = "gtsam-extended" version = "4.3a1.post1" @@ -4590,6 +4702,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/01/4771b7ab2af1d1aba5b710bd8f13d9225c609425214b357590a17b01be77/langchain_core-1.3.3-py3-none-any.whl", hash = "sha256:18aae8506f37da7f74398492279a7d6efcee4f8e23c4c41c7af080eeb7ef7bd1", size = 543857, upload-time = "2026-05-05T19:02:34.52Z" }, ] +[[package]] +name = "langchain-google-genai" +version = "2.1.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filetype" }, + { name = "google-ai-generativelanguage" }, + { name = "langchain-core" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/38/8b3a71c729bd03e9eb0fd8bdb19e06a074c35bc2eaa61b1b9edfa863f38d/langchain_google_genai-2.1.12.tar.gz", hash = "sha256:4a98371e545eb97fcdf483086a4aebbb8eceeb9597ca5a9c4c35e92f4fbbd271", size = 77566, upload-time = "2025-09-17T01:27:11.747Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/8d/9dd9653e5414e73cae3480e5947bbbbd94ba7fa824efdf46e7ff2c0faef2/langchain_google_genai-2.1.12-py3-none-any.whl", hash = "sha256:4c07630419a8fbe7a2ec512c6dea68289663bfe7d5fae0ba431d2cd59a0d0880", size = 50746, upload-time = "2025-09-17T01:27:10.653Z" }, +] + [[package]] name = "langchain-huggingface" version = "1.2.0" @@ -7637,6 +7764,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, ] +[[package]] +name = "proto-plus" +version = "1.28.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/56/e647b0c675392d2da368da7b6f158f7368b18542fd6f7d7400a2f39de000/proto_plus-1.28.0.tar.gz", hash = "sha256:38e5696342835b08fc116f30a25665b29531cda9d5d5643e9b81fc312385abd9", size = 57221, upload-time = "2026-05-07T08:04:50.811Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/20/b122d4626976acb81132036d2ad1bb35a1a8775fceb837ec30964622516a/proto_plus-1.28.0-py3-none-any.whl", hash = "sha256:a630604310899e73c59ec302e5765c058d412b2f090b9c79c8822589f14955b8", size = 50410, upload-time = "2026-05-07T08:03:31.962Z" }, +] + [[package]] name = "protobuf" version = "6.33.5" @@ -7842,6 +7981,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/9c/47693463894b610f8439b2e970b82ef81e9599c757bf2049365e40ff963c/pyarrow-23.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:427deac1f535830a744a4f04a6ac183a64fcac4341b3f618e693c41b7b98d2b0", size = 28338905, upload-time = "2026-01-18T16:19:32.93Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = 
"sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pyaudio" version = "0.2.14"