From 8d28679d3c4f7806381967ac3052807b3b713e3c Mon Sep 17 00:00:00 2001 From: Ruthwik Date: Thu, 7 May 2026 19:32:18 -0700 Subject: [PATCH 01/15] feat: camera addition --- dimos/control/tasks/teleop_task.py | 2 +- dimos/teleop/quest/blueprints.py | 2 + dimos/teleop/quest/quest_teleop_module.py | 69 ++++++++- dimos/teleop/quest/web/static/index.html | 168 +++++++++++++++++++++- 4 files changed, 235 insertions(+), 6 deletions(-) diff --git a/dimos/control/tasks/teleop_task.py b/dimos/control/tasks/teleop_task.py index 359e425c57..78163c58e6 100644 --- a/dimos/control/tasks/teleop_task.py +++ b/dimos/control/tasks/teleop_task.py @@ -80,7 +80,7 @@ class TeleopIKTaskConfig: ee_joint_id: int priority: int = 10 timeout: float = 0.5 - max_joint_delta_deg: float = 5.0 # ~500°/s at 100Hz + max_joint_delta_deg: float = 50.0 # ~500°/s at 100Hz hand: Literal["left", "right"] | None = None gripper_joint: str | None = None gripper_open_pos: float = 0.0 diff --git a/dimos/teleop/quest/blueprints.py b/dimos/teleop/quest/blueprints.py index b6fa1257a2..ce0f0d6560 100644 --- a/dimos/teleop/quest/blueprints.py +++ b/dimos/teleop/quest/blueprints.py @@ -27,6 +27,7 @@ from dimos.core.coordination.blueprints import autoconnect from dimos.core.transport import LCMTransport from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.Image import Image from dimos.teleop.quest.quest_extensions import ArmTeleopModule from dimos.teleop.quest.quest_types import Buttons from dimos.visualization.rerun.bridge import RerunBridgeModule @@ -68,6 +69,7 @@ "/coordinator/cartesian_command", PoseStamped ), ("buttons", Buttons): LCMTransport("/teleop/buttons", Buttons), + ("color_image", Image): LCMTransport("/teleop/color_image", Image), } ) diff --git a/dimos/teleop/quest/quest_teleop_module.py b/dimos/teleop/quest/quest_teleop_module.py index cd1e5aea42..64f792330c 100644 --- a/dimos/teleop/quest/quest_teleop_module.py +++ b/dimos/teleop/quest/quest_teleop_module.py @@ -21,23 +21,26 @@ deltas, and publishes PoseStamped commands. """ +import asyncio from dataclasses import dataclass from enum import IntEnum from pathlib import Path import threading import time -from typing import Any, TypeVar +from typing import Any, AsyncIterator, TypeVar +import cv2 from dimos_lcm.geometry_msgs import PoseStamped as LCMPoseStamped from dimos_lcm.sensor_msgs import Joy as LCMJoy from fastapi import WebSocket, WebSocketDisconnect -from fastapi.responses import HTMLResponse +from fastapi.responses import HTMLResponse, Response, StreamingResponse from fastapi.staticfiles import StaticFiles from dimos.core.core import rpc from dimos.core.module import Module, ModuleConfig -from dimos.core.stream import Out +from dimos.core.stream import In, Out from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.Image import Image from dimos.msgs.sensor_msgs.Joy import Joy from dimos.teleop.quest.quest_types import Buttons, QuestControllerState from dimos.teleop.utils.teleop_transforms import webxr_to_robot @@ -73,6 +76,8 @@ class QuestTeleopConfig(ModuleConfig): control_loop_hz: float = 50.0 server_port: int = 8443 + video_fps: float = 30.0 + video_jpeg_quality: int = 70 _Config = TypeVar("_Config", bound=QuestTeleopConfig) @@ -98,6 +103,10 @@ class QuestTeleopModule(Module): right_controller_output: Out[PoseStamped] buttons: Out[Buttons] + # Optional image input — if connected (e.g. by a sim or RealSense module), + # frames are JPEG-encoded and served at /video for the Quest browser. + color_image: In[Image] + def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) @@ -125,6 +134,10 @@ def __init__(self, **kwargs: Any) -> None: LCMJoy._get_packed_fingerprint(): self._on_joy_bytes, } + # Latest JPEG-encoded camera frame, served by /video. + self._latest_jpeg: bytes | None = None + self._image_lock = threading.Lock() + self._setup_routes() def _setup_routes(self) -> None: @@ -158,11 +171,61 @@ async def websocket_endpoint(ws: WebSocket) -> None: except Exception: logger.exception("WebSocket error") + @self._web_server.app.get("/video") + async def video_feed() -> StreamingResponse: + return StreamingResponse( + self._mjpeg_stream(), + media_type="multipart/x-mixed-replace; boundary=frame", + ) + + @self._web_server.app.get("/snapshot") + async def snapshot() -> Response: + with self._image_lock: + jpeg = self._latest_jpeg + if jpeg is None: + return Response(status_code=204) + return Response(content=jpeg, media_type="image/jpeg") + + async def _mjpeg_stream(self) -> AsyncIterator[bytes]: + """Yield multipart MJPEG frames at video_fps from the latest JPEG cache.""" + period = 1.0 / max(1.0, self.config.video_fps) + while True: + with self._image_lock: + jpeg = self._latest_jpeg + if jpeg is not None: + yield ( + b"--frame\r\n" + b"Content-Type: image/jpeg\r\n" + b"Content-Length: " + str(len(jpeg)).encode() + b"\r\n\r\n" + + jpeg + b"\r\n" + ) + await asyncio.sleep(period) + + def _on_image(self, msg: Image) -> None: + """Encode incoming Image to JPEG and cache for /video.""" + try: + bgr = msg.to_opencv() + ok, buf = cv2.imencode( + ".jpg", bgr, [int(cv2.IMWRITE_JPEG_QUALITY), self.config.video_jpeg_quality] + ) + if not ok: + return + with self._image_lock: + self._latest_jpeg = buf.tobytes() + except Exception: + logger.exception("Failed to encode camera frame") + @rpc def start(self) -> None: super().start() self._start_server() self._start_control_loop() + # Optional: subscribe to camera feed if connected by the blueprint. + try: + self.color_image.subscribe(self._on_image) + logger.info("Quest teleop: camera feed subscribed → /video") + except Exception: + logger.info("Quest teleop: no camera connected, /video will be empty") logger.info("Quest Teleoperation Module started") @rpc diff --git a/dimos/teleop/quest/web/static/index.html b/dimos/teleop/quest/web/static/index.html index 8d31289f88..e8299cc0e0 100644 --- a/dimos/teleop/quest/web/static/index.html +++ b/dimos/teleop/quest/web/static/index.html @@ -108,6 +108,9 @@

DimOS Quest-3 Teleop

+ + + diff --git a/dimos/teleop/quest/web/static/teleop.css b/dimos/teleop/quest/web/static/teleop.css new file mode 100644 index 0000000000..701dd04ded --- /dev/null +++ b/dimos/teleop/quest/web/static/teleop.css @@ -0,0 +1,93 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +body { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + background-color: #fff; + color: #333; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + height: 100vh; + overflow: hidden; +} + +#ui { + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + z-index: 100; +} + +h1 { + font-size: 42px; + font-weight: 600; + margin-bottom: 40px; + color: #28283b; +} + +#status { + font-size: 28px; + font-weight: 500; + color: #0077b6; + padding: 16px 32px; + background-color: #fff; + border: 2px solid #0077b6; + border-radius: 12px; + margin-bottom: 24px; + min-width: 300px; + text-align: center; +} + +button { + padding: 20px 48px; + font-size: 24px; + font-weight: 600; + color: white; + background-color: #0077b6; + border: none; + border-radius: 12px; + cursor: pointer; + transition: background-color 0.2s; +} + +button:hover { + background-color: #005f8a; +} + +button:disabled { + background-color: #adb5bd; + cursor: not-allowed; +} + +button.disconnect { + background-color: #6c757d; + margin-top: 16px; +} + +button.disconnect:hover { + background-color: #545b62; +} + +#canvas { + display: none; + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; +} + +.hidden { + display: none !important; +} diff --git a/dimos/teleop/quest/web/static/teleop.js b/dimos/teleop/quest/web/static/teleop.js new file mode 100644 index 0000000000..f931f9fdd7 --- /dev/null +++ b/dimos/teleop/quest/web/static/teleop.js @@ -0,0 +1,451 @@ +// Global error handler +window.onerror = (msg, url, line, col, error) => { + console.error(`[ERROR] ${msg} at ${url}:${line}:${col}`, error); + document.getElementById('status').textContent = `Error: ${msg}`; +}; + +import { geometry_msgs, std_msgs, sensor_msgs } from "https://esm.sh/jsr/@dimos/msgs@0.1.4"; + +// WebSocket and VR state +let ws = null; +let xrSession = null; +let xrRefSpace = null; +let gl = null; +let lastSendTime = 0; +const sendInterval = 1000 / 80; // ~80Hz target + +// Video panel state +const videoEl = document.getElementById('videoFeed'); +let videoTex = null; +let videoProgram = null; +let videoVbo = null; +let videoAttribs = null; +let videoUniforms = null; +let videoReady = false; // true after first frame loads +let videoDirty = false; // true when a new JPEG has finished decoding +let videoAspect = 1.0; // cached at load time — see videoEl.onload +let prevBlobUrl = null; // revoked when the next-next blob arrives +const videoModelMatrix = new Float32Array(16); + +const PANEL_POS_X = 0.0; +const PANEL_POS_Y = 1.4; // ~eye height +const PANEL_POS_Z = -1.5; // 1.5m in front of starting position +const PANEL_HEIGHT = 0.9; + +// UI elements +const statusEl = document.getElementById('status'); +const connectBtn = document.getElementById('connectBtn'); +const disconnectBtn = document.getElementById('disconnectBtn'); +const canvas = document.getElementById('canvas'); + +function setStatus(msg) { + statusEl.textContent = msg; +} + +// WebSocket setup (LCM bridge) +function setupWebSocket() { + return new Promise((resolve, reject) => { + const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; + const wsUrl = `${protocol}//${window.location.host}/ws`; + + setStatus('Connecting to server...'); + ws = new WebSocket(wsUrl); + ws.binaryType = 'blob'; + + ws.onopen = () => { + setStatus('Server connected'); + resolve(); + }; + ws.onerror = (error) => { + setStatus('WebSocket error'); + console.error('WebSocket error:', error); + reject(error); + }; + ws.onclose = () => { + setStatus('WebSocket closed'); + }; + // Defer revoking the previous blob URL by one message — revoking + // immediately after setting src can race with the browser's load + // on some engines, briefly dropping naturalWidth to 0. + ws.onmessage = (e) => { + if (!(e.data instanceof Blob)) return; + const newUrl = URL.createObjectURL(e.data); + if (prevBlobUrl) URL.revokeObjectURL(prevBlobUrl); + prevBlobUrl = videoEl.src.startsWith('blob:') ? videoEl.src : null; + videoEl.src = newUrl; + }; + }); +} + +// Initialize WebGL +function initGL() { + gl = canvas.getContext('webgl', { + xrCompatible: true, + alpha: true + }); + if (!gl) { + throw new Error('WebGL not supported'); + } + gl.clearColor(0, 0, 0, 0); // Transparent background for passthrough + initVideoPanel(); +} + +// Compile + link a textured-quad pipeline that renders the camera +// feed as a world-locked panel inside the WebXR scene. +function initVideoPanel() { + const vsSrc = ` + attribute vec2 a_pos; + attribute vec2 a_uv; + uniform mat4 u_proj; + uniform mat4 u_view; + uniform mat4 u_model; + varying vec2 v_uv; + void main() { + gl_Position = u_proj * u_view * u_model + * vec4(a_pos.x, a_pos.y, 0.0, 1.0); + v_uv = a_uv; + }`; + const fsSrc = ` + precision mediump float; + varying vec2 v_uv; + uniform sampler2D u_tex; + void main() { + gl_FragColor = texture2D(u_tex, v_uv); + }`; + + const compile = (type, src) => { + const sh = gl.createShader(type); + gl.shaderSource(sh, src); + gl.compileShader(sh); + if (!gl.getShaderParameter(sh, gl.COMPILE_STATUS)) { + throw new Error('Shader compile failed: ' + gl.getShaderInfoLog(sh)); + } + return sh; + }; + const vs = compile(gl.VERTEX_SHADER, vsSrc); + const fs = compile(gl.FRAGMENT_SHADER, fsSrc); + videoProgram = gl.createProgram(); + gl.attachShader(videoProgram, vs); + gl.attachShader(videoProgram, fs); + gl.linkProgram(videoProgram); + if (!gl.getProgramParameter(videoProgram, gl.LINK_STATUS)) { + throw new Error('Program link failed: ' + gl.getProgramInfoLog(videoProgram)); + } + + // Quad as TRIANGLE_STRIP: x, y, u, v. v=0 at top of quad + + // UNPACK_FLIP_Y_WEBGL=false → image displays upright. + const verts = new Float32Array([ + -1, -1, 0, 1, + 1, -1, 1, 1, + -1, 1, 0, 0, + 1, 1, 1, 0, + ]); + videoVbo = gl.createBuffer(); + gl.bindBuffer(gl.ARRAY_BUFFER, videoVbo); + gl.bufferData(gl.ARRAY_BUFFER, verts, gl.STATIC_DRAW); + + videoAttribs = { + pos: gl.getAttribLocation(videoProgram, 'a_pos'), + uv: gl.getAttribLocation(videoProgram, 'a_uv'), + }; + videoUniforms = { + proj: gl.getUniformLocation(videoProgram, 'u_proj'), + view: gl.getUniformLocation(videoProgram, 'u_view'), + model: gl.getUniformLocation(videoProgram, 'u_model'), + tex: gl.getUniformLocation(videoProgram, 'u_tex'), + }; + + videoTex = gl.createTexture(); + gl.bindTexture(gl.TEXTURE_2D, videoTex); + gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_S, gl.CLAMP_TO_EDGE); + gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_WRAP_T, gl.CLAMP_TO_EDGE); + gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MIN_FILTER, gl.LINEAR); + gl.texParameteri(gl.TEXTURE_2D, gl.TEXTURE_MAG_FILTER, gl.LINEAR); + gl.pixelStorei(gl.UNPACK_FLIP_Y_WEBGL, false); + + // Cache aspect here — naturalWidth can transiently drop to 0 + // between loads, which would collapse the panel to 1:1. + videoEl.onload = () => { + videoReady = true; + videoDirty = true; + if (videoEl.naturalHeight) { + videoAspect = videoEl.naturalWidth / videoEl.naturalHeight; + } + }; +} + +function uploadVideoTexture() { + gl.bindTexture(gl.TEXTURE_2D, videoTex); + gl.texImage2D( + gl.TEXTURE_2D, 0, gl.RGB, gl.RGB, gl.UNSIGNED_BYTE, videoEl + ); +} + +function updateModelMatrix() { + const halfH = PANEL_HEIGHT * 0.5; + const halfW = halfH * videoAspect; + videoModelMatrix.fill(0); + videoModelMatrix[0] = halfW; + videoModelMatrix[5] = halfH; + videoModelMatrix[10] = 1; + videoModelMatrix[12] = PANEL_POS_X; + videoModelMatrix[13] = PANEL_POS_Y; + videoModelMatrix[14] = PANEL_POS_Z; + videoModelMatrix[15] = 1; +} + +function renderVideoPanel(view, viewport) { + gl.viewport(viewport.x, viewport.y, viewport.width, viewport.height); + gl.useProgram(videoProgram); + + gl.bindBuffer(gl.ARRAY_BUFFER, videoVbo); + gl.enableVertexAttribArray(videoAttribs.pos); + gl.vertexAttribPointer(videoAttribs.pos, 2, gl.FLOAT, false, 16, 0); + gl.enableVertexAttribArray(videoAttribs.uv); + gl.vertexAttribPointer(videoAttribs.uv, 2, gl.FLOAT, false, 16, 8); + + gl.uniformMatrix4fv(videoUniforms.proj, false, view.projectionMatrix); + gl.uniformMatrix4fv(videoUniforms.view, false, view.transform.inverse.matrix); + gl.uniformMatrix4fv(videoUniforms.model, false, videoModelMatrix); + + gl.activeTexture(gl.TEXTURE0); + gl.bindTexture(gl.TEXTURE_2D, videoTex); + gl.uniform1i(videoUniforms.tex, 0); + + gl.drawArrays(gl.TRIANGLE_STRIP, 0, 4); +} + + +// Send raw controller tracking data (no processing - done in Python) +function processTracking(frame) { + // Rate limit tracking data + const now = performance.now(); + if (now - lastSendTime < sendInterval) { + return; + } + lastSendTime = now; + + // Process input sources (controllers) + for (const inputSource of frame.session.inputSources) { + const trackingSpace = inputSource.gripSpace || inputSource.targetRaySpace; + if (!trackingSpace) continue; + + const handedness = inputSource.handedness; + if (handedness !== 'left' && handedness !== 'right') continue; + + const pose = frame.getPose(trackingSpace, xrRefSpace); + if (!pose) continue; + + // Send raw pose directly from WebXR - no processing + const pos = pose.transform.position; + const rot = pose.transform.orientation; + + const nowMs = Date.now(); + const poseStamped = new geometry_msgs.PoseStamped({ + header: new std_msgs.Header({ + stamp: new std_msgs.Time({ sec: Math.floor(nowMs / 1000), nsec: (nowMs % 1000) * 1_000_000 }), + frame_id: handedness + }), + pose: new geometry_msgs.Pose({ + position: new geometry_msgs.Point({ x: pos.x, y: pos.y, z: pos.z }), + orientation: new geometry_msgs.Quaternion({ x: rot.x, y: rot.y, z: rot.z, w: rot.w }) + }) + }); + + if (ws && ws.readyState === WebSocket.OPEN) { + ws.send(poseStamped.encode()); + } + + // Send Joy message with all buttons and axes + const gamepad = inputSource.gamepad; + if (gamepad) { + const isXrStandard = gamepad.mapping === 'xr-standard'; + const stickX = (isXrStandard ? gamepad.axes[2] : gamepad.axes[0]) ?? 0.0; + const stickY = (isXrStandard ? gamepad.axes[3] : gamepad.axes[1]) ?? 0.0; + const axes = [ + stickX, + stickY, + gamepad.buttons[0]?.value ?? 0.0, + gamepad.buttons[1]?.value ?? 0.0, + ]; + + // Buttons layout (int32, 0 or 1): + // [0] = trigger (digital) + // [1] = grip (digital) + // [2] = touchpad press + // [3] = thumbstick press + // [4] = X/A button + // [5] = Y/B button + // [6] = menu (if exposed) + const buttons = []; + for (let i = 0; i < gamepad.buttons.length; i++) { + buttons.push(gamepad.buttons[i]?.pressed ? 1 : 0); + } + + const joyMsg = new sensor_msgs.Joy({ + header: new std_msgs.Header({ + stamp: new std_msgs.Time({ sec: Math.floor(nowMs / 1000), nsec: (nowMs % 1000) * 1_000_000 }), + frame_id: handedness + }), + axes_length: axes.length, + buttons_length: buttons.length, + axes: axes, + buttons: buttons + }); + + if (ws && ws.readyState === WebSocket.OPEN) { + ws.send(joyMsg.encode()); + } + } + } +} + +// VR render loop +function onXRFrame(_time, frame) { + if (!xrSession) return; + xrSession.requestAnimationFrame(onXRFrame); + // Process and send tracking data + processTracking(frame); + + const glLayer = xrSession.renderState.baseLayer; + gl.bindFramebuffer(gl.FRAMEBUFFER, glLayer.framebuffer); + gl.clear(gl.COLOR_BUFFER_BIT | gl.DEPTH_BUFFER_BIT); + + if (!videoReady) return; + // Keep rendering the existing texture between loads to avoid blinking. + if (videoDirty && videoEl.naturalWidth) { + uploadVideoTexture(); + videoDirty = false; + } + updateModelMatrix(); + const pose = frame.getViewerPose(xrRefSpace); + if (pose) { + for (const view of pose.views) { + renderVideoPanel(view, glLayer.getViewport(view)); + } + } +} + +// Start VR session with passthrough +async function startVR() { + try { + setStatus('Initializing WebGL...'); + initGL(); + setStatus('Requesting VR session...'); + + // Try immersive-ar first (true passthrough), fall back to immersive-vr + let session = null; + try { + session = await navigator.xr.requestSession('immersive-ar', { + requiredFeatures: ['local-floor'], + optionalFeatures: ['hand-tracking'] + }); + console.log('Started immersive-ar session (passthrough)'); + } catch (arError) { + console.log('immersive-ar not available, trying immersive-vr'); + session = await navigator.xr.requestSession('immersive-vr', { + requiredFeatures: ['local-floor'], + optionalFeatures: ['hand-tracking'] + }); + console.log('Started immersive-vr session'); + } + + xrSession = session; + + // Setup WebGL layer + const glLayer = new XRWebGLLayer(session, gl); + await session.updateRenderState({ + baseLayer: glLayer + }); + + // Get reference space + xrRefSpace = await session.requestReferenceSpace('local-floor'); + + setStatus('VR active'); + + // Session event handlers + session.addEventListener('end', () => { + setStatus('VR session ended'); + xrSession = null; + window.disconnect(); + }); + + // Start render loop + session.requestAnimationFrame(onXRFrame); + + } catch (error) { + setStatus('VR failed: ' + error.message); + console.error('VR session error:', error); + throw error; + } +} + +// Connect button handler +window.connect = async function() { + try { + connectBtn.disabled = true; + + // Check WebXR support + if (!navigator.xr) { + throw new Error('WebXR not supported. Use Quest 3 browser.'); + } + + // Setup WebSocket + await setupWebSocket(); + + // Start VR + await startVR(); + + // Update UI + connectBtn.classList.add('hidden'); + disconnectBtn.classList.remove('hidden'); + + } catch (error) { + setStatus('Connection failed'); + console.error('Connection error:', error); + connectBtn.disabled = false; + } +}; + +// Disconnect button handler +window.disconnect = async function() { + setStatus('Disconnecting...'); + + if (xrSession) { + await xrSession.end().catch(console.error); + xrSession = null; + } + + if (ws) { + ws.close(); + ws = null; + } + + // Update UI + connectBtn.classList.remove('hidden'); + connectBtn.disabled = false; + disconnectBtn.classList.add('hidden'); + setStatus('Disconnected'); +}; + +// Check WebXR availability on load +window.addEventListener('load', async () => { + if (!navigator.xr) { + setStatus('WebXR not available'); + connectBtn.disabled = true; + return; + } + + try { + // Check for AR (passthrough) or VR support + const arSupported = await navigator.xr.isSessionSupported('immersive-ar').catch(() => false); + const vrSupported = await navigator.xr.isSessionSupported('immersive-vr').catch(() => false); + + if (!arSupported && !vrSupported) { + setStatus('VR/AR not supported'); + connectBtn.disabled = true; + } + } catch (error) { + console.error('WebXR check failed:', error); + } +});