diff --git a/Auto_Use/macOS_use/agent/main_driver/service.py b/Auto_Use/macOS_use/agent/main_driver/service.py index 832ed7b..8f61816 100644 --- a/Auto_Use/macOS_use/agent/main_driver/service.py +++ b/Auto_Use/macOS_use/agent/main_driver/service.py @@ -689,7 +689,7 @@ def process_request(self, task: str) -> str: # Note: messages_for_api has previous responses with Step 1 replaced if applicable # normalized_json is current response (with thinking) self._save_conversation(messages_for_api, user_message, normalized_json, image_sent, step_number) - + # Remove thinking from response before adding to memory (saves tokens) normalized_json_without_thinking = self._remove_thinking_from_response(normalized_json) diff --git a/Auto_Use/macOS_use/remote_connection/telegram/banner.py b/Auto_Use/macOS_use/remote_connection/telegram/banner.py index 0d8e9b9..870c69b 100644 --- a/Auto_Use/macOS_use/remote_connection/telegram/banner.py +++ b/Auto_Use/macOS_use/remote_connection/telegram/banner.py @@ -291,18 +291,53 @@ """ -# Compact HTML — used when StatusBanner(compact=True). Just the orb in a tiny -# circular pill, no message span, no Next button, no JS message handlers. The -# centred PC monitor icon cross-fades with a Telegram paper-plane every ~5s -# so the user can tell at a glance this is a Telegram-triggered task. +# Compact HTML — used when StatusBanner(compact=True). +# +# Visual model: +# Empty state (no task running): +# ┌──┐ +# │○ │ 44×44 white circle, just the orb. +# └──┘ +# +# Text streaming (during a task): +# ┌────────────────────────────────────────┐ +# │○ single-line text streams to the right│ +# └────────────────────────────────────────┘ +# 440 px +# +# Pre-expand width, then stream (no jitter): +# - body has fixed empty width (44) and fixed has-text width (440); the +# height stays at 44 in both states. On first setMsg(), JS toggles +# body.has-text, body width snaps 44→440, ResizeObserver fires once, +# Python animates the NSPanel over ~0.25 s. JS then waits 350 ms +# before appending the first word, so streaming starts AFTER the +# banner finishes expanding. +# +# Paging through long messages: +# - .msg is `white-space: nowrap; overflow: hidden` and capped at +# max-width: 388 px. After each word, JS does a sync layout read to +# check if scrollWidth has exceeded the visible width. If yes, the +# word that overflowed is removed, we hold the current line briefly, +# then clear and continue streaming the remaining words on a fresh +# line. Loops until every word has been displayed. COMPACT_HTML = """
@@ -605,6 +633,105 @@ def _apply_rounded_region(title: str) -> None:
+ + """ @@ -624,13 +751,25 @@ class _BannerState: Deliberately NOT used as `js_api` — see _make_js_handlers.""" - def __init__(self, title: str, width: int, min_h: int, compact: bool): + def __init__(self, title: str, width: int, min_h: int, compact: bool, + screen_w: int = 1920, top_margin: int = SCREEN_MARGIN, + right_margin: int = SCREEN_MARGIN): self.window = None self.title = title self.width = width self.min_h = min_h self.compact = compact self.last_h = min_h + # Last reported width — only relevant for compact mode where both + # axes grow. Standard mode keeps a fixed width so this stays at + # init value. + self.last_w = width + # Screen geometry the resize handler uses to anchor the pill's + # top-right corner. Without this the pill would drift leftward + # across the screen as it grew wider. + self.screen_w = screen_w + self.top_margin = top_margin + self.right_margin = right_margin def _make_js_handlers(state: _BannerState): @@ -661,8 +800,9 @@ def save_clicked(value=None): def height_changed(h=0): """Resize the window to fit the reported body height, then re-clip the (possibly taller) window into a stadium so the end - caps follow the new height. No-op for the compact pill which - has no scrollable content and a constant 80×80 size.""" + caps follow the new height. Used by the STANDARD banner only — + the compact pill posts {w, h} to size_changed instead, which + animates both axes.""" if state.compact or state.window is None: return None try: @@ -679,7 +819,41 @@ def height_changed(h=0): pass return None - return next_clicked, choice_clicked, save_clicked, height_changed + def size_changed(w=0, h=0): + """Resize the compact pill in both axes to fit its natural body + size, then re-position so the top-right corner stays anchored to + its screen position (without this, growing wider would push the + pill leftward across the screen). Compact mode only — the + standard banner has a fixed width and uses height_changed.""" + if not state.compact or state.window is None: + return None + try: + new_w = max(COMPACT_MIN_W, min(COMPACT_MAX_W, int(w))) + new_h = max(COMPACT_MIN_H, min(COMPACT_MAX_H, int(h))) + if new_w == state.last_w and new_h == state.last_h: + return None + state.last_w = new_w + state.last_h = new_h + # window.move BEFORE resize: when we shrink the pill, resizing + # first leaves a brief 1-frame gap on the right; moving first + # closes that gap. Both APIs schedule on the GUI thread so the + # ordering is honoured by WinForms. + new_x = max(0, state.screen_w - new_w - state.right_margin) + new_y = state.top_margin + try: + state.window.move(new_x, new_y) + except Exception: + pass + state.window.resize(new_w, new_h) + # Region clip is sized in absolute pixels — recompute for the + # new dimensions or the pill renders with hard rectangle + # corners on its excess area. + _apply_rounded_region(state.title) + except Exception: + pass + return None + + return next_clicked, choice_clicked, save_clicked, height_changed, size_changed # ── stdin reader thread (subprocess-side only) ─────────────────────────── @@ -781,16 +955,19 @@ def _run_subprocess_banner() -> None: except Exception: screen_w = 1920 - w = COMPACT_SIZE if compact else PILL_WIDTH - h = COMPACT_SIZE if compact else PILL_HEIGHT + w = COMPACT_MIN_W if compact else PILL_WIDTH + h = COMPACT_MIN_H if compact else PILL_HEIGHT x = max(0, screen_w - w - SCREEN_MARGIN) y = SCREEN_MARGIN html = COMPACT_HTML if compact else BANNER_HTML title = f"AutoUseBanner_{uuid.uuid4().hex[:8]}" - state = _BannerState(title=title, width=w, min_h=h, compact=compact) - next_clicked, choice_clicked, save_clicked, height_changed = ( - _make_js_handlers(state) + state = _BannerState( + title=title, width=w, min_h=h, compact=compact, + screen_w=screen_w, top_margin=SCREEN_MARGIN, + right_margin=SCREEN_MARGIN, ) + (next_clicked, choice_clicked, save_clicked, + height_changed, size_changed) = _make_js_handlers(state) # No js_api here — methods on a Nuitka-compiled class fail pywebview's # `inspect.ismethod` filter and never get exposed to JS. We register @@ -809,30 +986,41 @@ def _run_subprocess_banner() -> None: resizable=False, ) state.window = window - window.expose(next_clicked, choice_clicked, save_clicked, height_changed) + window.expose(next_clicked, choice_clicked, save_clicked, + height_changed, size_changed) _log("window created and handlers exposed") def _on_shown(): _log("on_shown: entered") # Compact mode: WinForms stretches our small create_window - # request to its OS-imposed minimum width (~132+ logical px), - # producing a wide pill instead of the tight circle we want. + # request to its OS-imposed minimum width (~132+ logical px). # A programmatic window.resize() AFTER the form is alive - # bypasses that minimum — Form.Size setter doesn't go through - # the SM_CXMINTRACK clamp the way the initial size does. We - # then re-clip the (now smaller, square) window into a circle. + # bypasses that minimum. We size to COMPACT_MIN_W × COMPACT_MIN_H + # initially; once the page loads and the ResizeObserver fires, + # size_changed will resize the window to fit the natural + # content (orb-only until text streams in). if compact: try: - window.resize(COMPACT_SIZE, COMPACT_SIZE) + window.resize(COMPACT_MIN_W, COMPACT_MIN_H) + # Reposition to anchor top-right based on the actual + # initial size — without this WinForms may have placed + # the wider initial window further left than we want. + new_x = max( + 0, state.screen_w - COMPACT_MIN_W - state.right_margin + ) + try: + window.move(new_x, state.top_margin) + except Exception: + pass # Give WinForms one frame to actually realise the new # rect before _apply_rounded_region reads it — without # this the region clip runs against the old wide-pill - # geometry and we lose the circle shape. + # geometry. time.sleep(0.1) except Exception: pass - # Clip into a pill (or circle, in compact mode) and emit READY - # so the parent's show() unblocks. + # Clip into a stadium pill and emit READY so the parent's + # show() unblocks. _apply_rounded_region(title) # Compact indicator is purely visual — drop mouse input so the # user can click the desktop or any window underneath it. Only @@ -991,8 +1179,8 @@ def show(self) -> None: _stderr("banner subprocess never emitted READY") def update(self, text: str) -> None: - if self._compact: - return + # Both modes accept MSG now — the compact pill renders the + # thinking-stream text in its msg span and grows to fit. self._send({"cmd": "MSG", "text": text or ""}) def wait_for_next(self, timeout: float | None = None) -> bool: diff --git a/Auto_Use/windows_use/remote_connection/telegram/service.py b/Auto_Use/windows_use/remote_connection/telegram/service.py index 613b8ae..e71a961 100644 --- a/Auto_Use/windows_use/remote_connection/telegram/service.py +++ b/Auto_Use/windows_use/remote_connection/telegram/service.py @@ -47,6 +47,7 @@ from pathlib import Path from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup +from telegram.error import BadRequest from telegram.ext import ( Application, CommandHandler, @@ -695,12 +696,26 @@ def _run_agent(task, provider, model, chat_id, bot, loop): provider_keys.get(provider_key_name) if provider_key_name else None ) + # Pipe each step's formatted response (thinking + current_goal + + # memory + verdict, with action stripped) into the compact banner. + # The agent already calls text_callback at + # main_driver/service.py with exactly this content — same path the + # frontend's streamAgentText uses in app.py. update() forwards via + # the MSG stdin command to the banner subprocess; the call returns + # quickly so the agent loop never blocks on it. + def _banner_update(text: str) -> None: + try: + task_banner.update(text) + except Exception: + logger.warning("banner.update failed", exc_info=True) + agent = AgentService( provider=provider, model=model, save_conversation=False, thinking=True, api_key=provider_api_key, + text_callback=_banner_update, ) agent.process_request(task) # Stop the monitor BEFORE the done message so the final scratchpad @@ -734,6 +749,15 @@ def _run_agent(task, provider, model, chat_id, bot, loop): # ── entry points ───────────────────────────────────────────────────────────── +async def _on_error(update, context): + err = context.error + # Benign: user tapped the same inline button twice, so the edit produces + # identical content. Telegram rejects it; swallow quietly. + if isinstance(err, BadRequest) and "Message is not modified" in str(err): + return + logger.error("Unhandled exception in telegram handler", exc_info=err) + + def _build_telegram_app(token: str): """Build a python-telegram-bot Application with all our handlers wired. @@ -747,6 +771,7 @@ def _build_telegram_app(token: str): .post_init(_post_init) .build() ) + app.add_error_handler(_on_error) app.add_handler(CommandHandler("start", start_cmd)) app.add_handler(CommandHandler("reset", reset_cmd)) app.add_handler(CallbackQueryHandler(callback_handler)) diff --git a/frontend/script.js b/frontend/script.js index 7cf4bf2..4c46f3b 100644 --- a/frontend/script.js +++ b/frontend/script.js @@ -723,92 +723,111 @@ document.addEventListener('DOMContentLoaded', () => { } }; - // Milestone streaming - word by word, stacking vertically + // Milestone streaming — letter by letter, stacking vertically. Each + // char gets its own opacity-fade-in span so the line types out + // smoothly (matching the Telegram banner's typewriter feel). window.streamMilestone = (text) => { const milestoneStream = document.getElementById('milestoneStream'); if (!milestoneStream) return; - - // Create new milestone line + const milestoneLine = document.createElement('div'); milestoneLine.className = 'milestone-line'; milestoneStream.appendChild(milestoneLine); - - // Split text into words - const words = text.split(/\s+/).filter(w => w.length > 0); - let currentIndex = 0; - - // Fast streaming speed (milliseconds per word) - const speed = 30; - - const streamWord = () => { - if (currentIndex < words.length) { - // Add word with space - if (currentIndex > 0) { - milestoneLine.textContent += ' '; - } - milestoneLine.textContent += words[currentIndex]; - currentIndex++; - - // Auto-scroll to bottom - milestoneStream.parentElement.scrollTop = milestoneStream.parentElement.scrollHeight; - - setTimeout(streamWord, speed); - } + + // Array.from splits by code point so emoji (🧠 / 🎯 / ✅) stay intact. + const chars = Array.from(text); + let i = 0; + const CHAR_DELAY_MS = 5; + const FADE_MS = 60; + + const streamChar = () => { + if (i >= chars.length) return; + const span = document.createElement('span'); + span.textContent = chars[i]; + span.style.opacity = '0'; + span.style.transition = 'opacity ' + FADE_MS + 'ms ease-out'; + milestoneLine.appendChild(span); + requestAnimationFrame(() => { span.style.opacity = '1'; }); + + // Auto-scroll keeps the newest line pinned to the bottom. + milestoneStream.parentElement.scrollTop = milestoneStream.parentElement.scrollHeight; + + i++; + setTimeout(streamChar, CHAR_DELAY_MS); }; - - // Start streaming - streamWord(); + + streamChar(); }; - // Word-by-word streaming for agent text in the response strip + // Letter-by-letter streaming for agent text in the response strip. + // Each char is its own span with an opacity fade so the line types + // out smoothly. When a char would overflow the strip's right edge, + // we clear and restart with that same char at the left (matches the + // Telegram banner's pager behavior, just inside a single-line strip + // instead of a pill). let streamingTimeout = null; window.streamAgentText = (text) => { const agentText = document.getElementById('agentText'); const agentStrip = document.getElementById('agentResponseStrip'); - + if (!agentText || !agentStrip) return; - - // Make sure strip is visible + agentStrip.classList.add('active'); - - // Clear any existing streaming + if (streamingTimeout) { clearTimeout(streamingTimeout); } - - // Split text into words - const words = text.split(/\s+/).filter(w => w.length > 0); - let currentIndex = 0; - let currentLine = ''; - - // Speed in milliseconds per word (fast but readable) - const baseSpeed = 25; - - const streamWord = () => { - if (currentIndex < words.length) { - // Add next word to current line - const testLine = currentLine ? currentLine + ' ' + words[currentIndex] : words[currentIndex]; - - // Temporarily set to measure width - agentText.textContent = testLine; - - // Check if text overflows the container - if (agentText.scrollWidth > agentText.clientWidth) { - // Reset - start fresh from left with current word - currentLine = words[currentIndex]; - agentText.textContent = currentLine; - } else { - // Fits - keep adding - currentLine = testLine; + + // Array.from preserves emoji as single tokens. + const chars = Array.from(text); + let i = 0; + const CHAR_DELAY_MS = 4; + const FADE_MS = 60; + + // Start with a clean strip — previous step's text would otherwise + // be measured into the new overflow check. + agentText.textContent = ''; + + const appendCharFade = (ch) => { + const span = document.createElement('span'); + span.textContent = ch; + span.style.opacity = '0'; + span.style.transition = 'opacity ' + FADE_MS + 'ms ease-out'; + agentText.appendChild(span); + requestAnimationFrame(() => { span.style.opacity = '1'; }); + return span; + }; + + const streamChar = () => { + if (i >= chars.length) return; + + const span = appendCharFade(chars[i]); + + // Sync layout read forces reflow → we see whether this char + // pushed past the strip's right edge. If yes (and it isn't + // the only char), clear the strip and re-place this char at + // the left edge of a fresh line. + if (agentText.scrollWidth > agentText.clientWidth + 0.5) { + if (agentText.childElementCount === 1) { + // Single char wider than the strip — accept and move on + // so we don't loop forever. + i++; + streamingTimeout = setTimeout(streamChar, CHAR_DELAY_MS); + return; } - - currentIndex++; - streamingTimeout = setTimeout(streamWord, baseSpeed); + agentText.textContent = ''; + // Skip leading whitespace so the new line doesn't open + // with a hanging space. + while (i < chars.length && /\s/.test(chars[i])) i++; + streamChar(); + return; } + + i++; + streamingTimeout = setTimeout(streamChar, CHAR_DELAY_MS); }; - - // Start streaming - streamWord(); + + streamChar(); }; @@ -1819,8 +1838,10 @@ document.addEventListener('DOMContentLoaded', () => { if (_tickFadeTimer) { clearTimeout(_tickFadeTimer); _tickFadeTimer = null; } }; - // Per-word fade-in stagger. Higher = calmer reading pace. - const CLI_WORD_STAGGER_MS = 45; + // Per-character cadence — matches the Telegram banner's 8 ms feel. + const CLI_CHAR_STAGGER_MS = 8; + // Per-character fade-in duration (opacity 0→1). + const CLI_CHAR_FADE_MS = 60; // Hold a finished page (full pill width filled) before clearing for the next page. const CLI_PAGE_HOLD_MS = 550; // Hold between distinct lines (after the final page of a line completes). @@ -1832,9 +1853,10 @@ document.addEventListener('DOMContentLoaded', () => { // millisecond. We're explicitly OK with the UI lagging behind real time — // smoothness matters more than catching up. // - // Each line is "paginated": words stream left-to-right; when the next - // word would overflow the pill width, the current page holds, the - // output clears, and that word starts the next page from the left. + // Each line is "paginated": characters stream left-to-right letter by + // letter; when the next char would overflow the pill width, the current + // page holds, the output clears, and that char starts the next page + // from the left. const _cliPillRunners = new WeakMap(); function _getRunner(pill) { @@ -1859,10 +1881,12 @@ document.addEventListener('DOMContentLoaded', () => { return; } - const words = String(text).split(/\s+/).filter(w => w.length > 0); + // Array.from splits by code point so emoji stay intact (text.split('') + // would split them into surrogate halves). + const chars = Array.from(String(text)); const lineClass = stream === 'err' ? 'cli-line cli-line-err' : 'cli-line cli-line-out'; - if (words.length === 0) { + if (chars.length === 0) { runner.running = false; _pumpCliRunner(pill, runner); return; @@ -1878,7 +1902,7 @@ document.addEventListener('DOMContentLoaded', () => { let i = 0; const tick = () => { - if (i >= words.length) { + if (i >= chars.length) { // Final page rendered — hold, then drop the run flag so the // next queued line gets pulled. setTimeout(() => { @@ -1890,29 +1914,36 @@ document.addEventListener('DOMContentLoaded', () => { const isFirstOnPage = pageDiv.childElementCount === 0; const span = document.createElement('span'); - span.className = 'cli-word'; - span.textContent = (isFirstOnPage ? '' : ' ') + words[i]; + span.className = 'cli-char'; + span.textContent = chars[i]; + span.style.opacity = '0'; + span.style.transition = 'opacity ' + CLI_CHAR_FADE_MS + 'ms ease-out'; pageDiv.appendChild(span); - // Did this word push past the pill's right edge? Measure on the + // Did this char push past the pill's right edge? Measure on the // page div itself — `.cli-line` has overflow:hidden so the - // overflow doesn't propagate up to `.cli-output`. If the word - // doesn't fit (and it isn't the only word on this page), retract - // it, hold the current page, then start fresh with this word at + // overflow doesn't propagate up to `.cli-output`. If the char + // doesn't fit (and it isn't the only char on this page), retract + // it, hold the current page, then start fresh with this char at // the left edge of a new page. const overflowed = pageDiv.scrollWidth > pageDiv.clientWidth + 1; if (overflowed && !isFirstOnPage) { pageDiv.removeChild(span); setTimeout(() => { startNewPage(); - tick(); // retry placing this word as the start of the new page + // Skip leading whitespace so the new page doesn't open + // with a hanging space. + while (i < chars.length && /\s/.test(chars[i])) i++; + tick(); // retry placing this char as the start of the new page }, CLI_PAGE_HOLD_MS); return; } - // Word fits (or it's a lone oversized word we accept as-is). + // Char fits (or it's a lone oversized char we accept as-is). Kick + // off the opacity fade on the next frame. + requestAnimationFrame(() => { span.style.opacity = '1'; }); i++; - setTimeout(tick, CLI_WORD_STAGGER_MS); + setTimeout(tick, CLI_CHAR_STAGGER_MS); }; tick();