Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b80e3c1
Forward SHM transports to Rerun and unify Go2 replay IPC
bogwi May 24, 2026
c48366d
fix: mypy
bogwi May 25, 2026
928c08f
fix: Greptile P1
bogwi May 25, 2026
eb622d6
feat: add gemini/local speak skills, agentic gemini Go2 blueprint, an…
grmkris May 26, 2026
d60d163
chore: use stock go2-memory recorder, drop Go2FullRecorder
grmkris May 26, 2026
faea9bd
feat(go2): take_picture skill — capture frame, upload to robomoo
grmkris May 26, 2026
05d2175
feat(go2): pose on captures + global_costmap → robomoo map uploader
grmkris May 26, 2026
ff023ca
feat(go2): all-Gemini VL + explore_and_capture + map uploader
grmkris May 27, 2026
0b0286d
feat(go2): all-Gemini VL (no local Moondream) for Mac-only agentic bl…
grmkris May 27, 2026
0d3cc48
feat(speak): non-blocking by default + fix per-call TTS thread leak
grmkris May 27, 2026
b45399c
feat(take_picture): fire-and-forget upload (return immediately)
grmkris May 27, 2026
f92dd89
feat(go2): tilt_body skill — aim camera up/down via Euler body pitch
grmkris May 27, 2026
30a09bc
feat(follow_person): local tracker between Gemini re-detections
grmkris May 27, 2026
e5b307a
feat(take_picture): tilt_and_capture — aim camera, photograph, re-lev…
grmkris May 27, 2026
e120a32
feat(go2): value-encoded occupancy upload + recording→robomoo exporter
grmkris May 27, 2026
fd3254c
chore: add go2-start.sh hackathon quickstart
tfius May 27, 2026
3af952e
feat(web): MJPEG + snapshot bridge for color_image
tfius May 27, 2026
86421bc
feat(unitree): Go2 audio I/O and audio-ws web bridge
tfius May 27, 2026
22ef9a1
docs(journal): camera + audio web bridges
tfius May 27, 2026
f1266de
chore(go2-start): launch camera + audio bridges by default; add SIMUL…
tfius May 27, 2026
66a6b51
feat(go2-start): LM Studio + mlxvlm Gemma-4 local LLM presets + sim w…
tfius May 27, 2026
97c3c04
fix(sim-with-llm): default to Mac-friendly unitree-go2-agentic-gemini
tfius May 27, 2026
3b83498
fix(sim-with-llm): avoid google-genai import; minimal Mac+LM-Studio c…
tfius May 27, 2026
2773213
fix(sim-with-llm): drop unitree-skill-container from minimal default
tfius May 27, 2026
d62128d
docs(journal): Mac + local-LLM agentic landscape
tfius May 27, 2026
c3dee5a
fix(audio-ws): use Server.run() + queue handoff; surface bind errors
tfius May 27, 2026
b626d28
feat(web): cmd-bridge-module — HTTP cmd_vel/path/pose bridge + mlxvlm…
tfius May 27, 2026
5155188
chore(go2-start): include cmd-bridge-module in default bridges
tfius May 27, 2026
76575ef
fix(go2-start): auto-inject mcp-server/mcp-client for real-robot LLM …
tfius May 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@ htmlcov/

# Memory2 autorecord
recording*.db
recording*.db-wal
recording*.db-shm

# MuJoCo runtime log
MUJOCO_LOG.TXT
145 changes: 145 additions & 0 deletions dimos/agents/skills/gemini_speak_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A speak skill backed by Google's Gemini TTS API.

Drop-in replacement for ``SpeakSkill`` that reuses ``GOOGLE_API_KEY`` (already
used for the LLM and embeddings) instead of an OpenAI key, and works on any
platform with an audio output device. Satisfies ``SpeakSkillSpec``
(``speak(text, blocking=True) -> str``).
"""

import threading
import time

from reactivex import Subject

from dimos.agents.annotation import skill
from dimos.core.core import rpc
from dimos.core.module import Module, ModuleConfig
from dimos.stream.audio.node_output import SounddeviceAudioOutput
from dimos.stream.audio.tts.node_gemini import GeminiTTSNode
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class GeminiSpeakSkillConfig(ModuleConfig):
# Prebuilt Gemini voice name (e.g. "Kore", "Puck", "Charon", "Aoede").
voice: str = "Kore"
# Gemini TTS model; must be a `*-preview-tts` model to emit audio.
model: str = "gemini-2.5-flash-preview-tts"


class GeminiSpeakSkill(Module):
config: GeminiSpeakSkillConfig
_tts_node: GeminiTTSNode | None = None
_audio_output: SounddeviceAudioOutput | None = None
_audio_lock: threading.Lock = threading.Lock()
_text_subject: "Subject[str] | None" = None

@rpc
def start(self) -> None:
super().start()
self._tts_node = GeminiTTSNode(voice=self.config.voice, model=self.config.model)
self._audio_output = SounddeviceAudioOutput(sample_rate=24000)
self._audio_output.consume_audio(self._tts_node.emit_audio())
# Wire the text pipeline ONCE. Each speak() just pushes onto this subject;
# the TTS node's own worker drains it FIFO. (Previously consume_text was
# called per speak(), spawning a fresh worker thread + subscription every
# call — a leak, and the source of the repeated "Starting GeminiTTSNode".)
self._text_subject = Subject()
self._tts_node.consume_text(self._text_subject)

@rpc
def stop(self) -> None:
if self._tts_node:
# dispose() clears the queue and joins the worker, so in-flight/queued
# speech is torn down here — no separate bg-thread bookkeeping needed.
self._tts_node.dispose()
self._tts_node = None
if self._audio_output:
self._audio_output.stop()
self._audio_output = None
self._text_subject = None
super().stop()

@skill
def speak(self, text: str, blocking: bool = False) -> str:
"""Speak text out loud through the robot's speakers.

USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak.

Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly.

Returns immediately by default (the audio plays in the background); pass
``blocking=True`` only when you must wait until the utterance finishes.

Example usage:

speak("Hello, I am your robot assistant.")
"""
if self._tts_node is None or self._text_subject is None:
return "Error: TTS not initialized"

if not blocking:
# Fire-and-forget: enqueue on the shared pipeline and return now.
self._text_subject.on_next(text)
return f"Speaking (non-blocking): {text}"

return self._speak_blocking(text)

def _speak_blocking(self, text: str) -> str:
# Serialize blocking speech so utterances don't overlap on the speaker.
with self._audio_lock:
if self._tts_node is None or self._text_subject is None:
return "Error: TTS not initialized"

audio_complete = threading.Event()

# emit_text() re-emits the exact utterance once its synthesis finishes;
# match on the text so a concurrent non-blocking speak can't trip us.
def on_text(t: str) -> None:
if t == text:
audio_complete.set()

def on_error(_e: Exception) -> None:
audio_complete.set()

subscription = self._tts_node.emit_text().subscribe(
on_next=on_text,
on_error=on_error,
)

self._text_subject.on_next(text)

# Gemini synthesis is a network round-trip; allow more headroom than
# the local-output time so first-token latency doesn't trip the wait.
timeout = max(15, len(text) * 0.1)
try:
if not audio_complete.wait(timeout=timeout):
logger.warning(f"TTS timeout reached for: {text}")
return f"Warning: TTS timeout while speaking: {text}"
# Small delay to ensure buffers flush.
time.sleep(0.3)
return f"Spoke: {text}"
finally:
subscription.dispose()


if __name__ == "__main__":
skill_module = GeminiSpeakSkill()
skill_module.start()
print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini.", blocking=True))
skill_module.stop()
122 changes: 122 additions & 0 deletions dimos/agents/skills/local_speak_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A speak skill backed by the macOS ``say`` command.

Drop-in replacement for ``SpeakSkill`` that requires no OpenAI key and no
audio pipeline: ``say`` synthesizes and plays the audio itself. macOS only.
Satisfies ``SpeakSkillSpec`` (``speak(text, blocking=True) -> str``).
"""

import shutil
import subprocess
import threading

from dimos.agents.annotation import skill
from dimos.constants import DEFAULT_THREAD_JOIN_TIMEOUT
from dimos.core.core import rpc
from dimos.core.module import Module, ModuleConfig
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class LocalSpeakSkillConfig(ModuleConfig):
# macOS `say` voice name (e.g. "Daniel", "Samantha"); None = system default.
voice: str | None = None
# Speech rate in words per minute; None = `say` default (~175).
rate: int | None = None


class LocalSpeakSkill(Module):
"""Speak text out loud through the local macOS ``say`` command."""

config: LocalSpeakSkillConfig
_bg_threads: list[threading.Thread] = []
_bg_threads_lock: threading.Lock = threading.Lock()

@rpc
def start(self) -> None:
super().start()
if shutil.which("say") is None:
logger.warning(
"LocalSpeakSkill: `say` not found on PATH; speak() will no-op. "
"This skill is macOS-only."
)

@rpc
def stop(self) -> None:
with self._bg_threads_lock:
threads = list(self._bg_threads)
for t in threads:
t.join(timeout=DEFAULT_THREAD_JOIN_TIMEOUT)
super().stop()

def _command(self, text: str) -> list[str]:
cmd = ["say"]
if self.config.voice:
cmd += ["-v", self.config.voice]
if self.config.rate:
cmd += ["-r", str(self.config.rate)]
cmd.append(text)
return cmd

@skill
def speak(self, text: str, blocking: bool = True) -> str:
"""Speak text out loud through the robot's speakers.

USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak.

Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly.

Example usage:

speak("Hello, I am your robot assistant.")
"""
if shutil.which("say") is None:
return "Error: `say` command not available (LocalSpeakSkill is macOS-only)"

if not text.strip():
return "Error: nothing to speak"

if not blocking:
thread = threading.Thread(
target=self._speak_blocking, args=(text,), daemon=True, name="LocalSpeakSkill-bg"
)
with self._bg_threads_lock:
self._bg_threads.append(thread)
thread.start()
return f"Speaking (non-blocking): {text}"

return self._speak_blocking(text)

def _speak_blocking(self, text: str) -> str:
try:
subprocess.run(self._command(text), check=True)
except subprocess.CalledProcessError as e:
logger.error(f"`say` failed: {e}")
return f"Error: failed to speak: {text}"
finally:
with self._bg_threads_lock:
self._bg_threads = [
t for t in self._bg_threads if t is not threading.current_thread()
]
return f"Spoke: {text}"


if __name__ == "__main__":
skill_module = LocalSpeakSkill()
skill_module.start()
print(skill_module.speak("Hello, I am your robot assistant, powered by Gemini."))
skill_module.stop()
91 changes: 91 additions & 0 deletions dimos/agents/skills/map_uploader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Periodically upload the 2D occupancy map (global_costmap) to the robomoo app.

Subscribes to `global_costmap`, encodes it as a *value-preserving* grayscale PNG
(free=0, occupied=1..100, unknown=255 — NOT a pre-colored image), throttles, and
POSTs it plus grid metadata (resolution, origin, width, height) to robomoo's
`/api/robot/map`. The web app reads the raw cell values back and applies its own
colormap + overlays, mapping world→pixel via
`col = (x - originX) / resolution`, `row = (y - originY) / resolution`.

Env: ROBOMOO_URL, ROBOT_INGEST_TOKEN.
"""

import os
import time

import cv2
import numpy as np
import httpx

from dimos.core.core import rpc
from dimos.core.module import Module, ModuleConfig
from dimos.core.stream import In
from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class MapUploaderConfig(ModuleConfig):
robomoo_url: str = os.getenv("ROBOMOO_URL", "")
ingest_token: str = os.getenv("ROBOT_INGEST_TOKEN", "")
min_period_s: float = 5.0 # throttle: at most one upload every N seconds


class MapUploader(Module):
config: MapUploaderConfig
global_costmap: In[OccupancyGrid]

@rpc
def start(self) -> None:
super().start()
self._last = 0.0
self.global_costmap.subscribe(self._on_costmap)

def _on_costmap(self, grid: OccupancyGrid) -> None:
now = time.monotonic()
if now - self._last < self.config.min_period_s:
return
url = self.config.robomoo_url
token = self.config.ingest_token
if not url or not token:
return
self._last = now

try:
# Value-preserving encoding: free/occupied 0..100 stay as-is, unknown
# (-1) → 255. The web recolors from these raw values, so we never bake
# a colormap into the upload. (H, W) uint8 → grayscale PNG.
enc = np.where(grid.grid == -1, 255, np.clip(grid.grid, 0, 100)).astype(np.uint8)
ok, buf = cv2.imencode(".png", enc)
if not ok:
return
httpx.post(
f"{url.rstrip('/')}/api/robot/map",
headers={"Authorization": f"Bearer {token}"},
files={"file": ("map.png", buf.tobytes(), "image/png")},
data={
"resolution": str(grid.resolution),
"originX": str(grid.origin.position.x),
"originY": str(grid.origin.position.y),
"width": str(grid.width),
"height": str(grid.height),
},
timeout=30.0,
).raise_for_status()
except Exception as e: # noqa: BLE001 — best-effort; never break the stream
logger.warning("map upload failed: %s", e)
Loading