diff --git a/Auto_Use/macOS_use/remote_connection/telegram/banner.py b/Auto_Use/macOS_use/remote_connection/telegram/banner.py
new file mode 100644
index 0000000..0d8e9b9
--- /dev/null
+++ b/Auto_Use/macOS_use/remote_connection/telegram/banner.py
@@ -0,0 +1,880 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Interactive walkthrough banner for setup.py.
+
+A small always-on-top pill at the top-right of the screen that contains:
+ - the animated stop-orb on the left,
+ - a status message in the middle (multi-line capable; pill grows downward),
+ - a clickable "Next" button on the right (only visible when the script is
+ waiting for the user — hidden during processing steps).
+
+setup.py calls show() once, then alternates update("…") + wait_for_next()
+to pace the user. close() tears it down. The Next button is shown
+automatically inside wait_for_next() and hidden as soon as it returns, so
+callers don't have to manage visibility manually.
+
+The pill default height is the original 44px. When a long status message
+wraps to multiple lines a ResizeObserver in JS posts the new body height
+back to Python via a second WKScriptMessageHandler, and Python resizes the
+NSWindow (top edge anchored, height grows downward).
+
+Everything runs inside the existing Python process. pywebview's main-thread
+NSApplication run loop (started by webview.start() in app.py) is reused —
+AppKit work is dispatched onto it via PyObjCTools.AppHelper.callAfter so the
+Flask worker thread that runs setup.py never touches Cocoa directly.
+
+If Cocoa/PyObjC isn't importable for any reason the class becomes a no-op
+so the automation still completes without a banner.
+"""
+import logging
+import threading
+
logger = logging.getLogger(__name__)

# PyObjC is an optional dependency. If any of the bridges below cannot be
# imported, `_COCOA_OK` is left False and the banner turns into a no-op.
try:
    from Cocoa import (
        NSPanel, NSColor, NSScreen,
        NSBackingStoreBuffered, NSMakeRect,
    )
    from Foundation import NSObject
    from WebKit import WKWebView, WKWebViewConfiguration
    from PyObjCTools.AppHelper import callAfter
except Exception as exc:
    logger.warning(f"banner: Cocoa unavailable, popup disabled ({exc})")
    _COCOA_OK = False
else:
    _COCOA_OK = True

# Style mask for a non-activating panel: clicks inside the WebView do NOT
# activate the Python process, so the AutoUse main pywebview window can't
# pop over Safari while the wizard is running. The panel can still become
# the key window, which is what lets a text input receive keyboard focus.
NSWindowStyleMaskNonactivatingPanel = 1 << 7  # 128
# Window level passed to setLevel_ when the panel is created.
NSStatusWindowLevel = 25
+
+
+BANNER_HTML = """
+
+
+
+
+
+
+
+
+
+
+
+Starting…
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+
+# Compact HTML — used when StatusBanner(compact=True). Just the orb in a tiny
+# circular pill, no message span, no Next button, no JS message handlers. The
+# centred PC monitor icon cross-fades with a Telegram paper-plane every ~5s
+# so the user can tell at a glance this is a Telegram-triggered task.
+COMPACT_HTML = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+
if _COCOA_OK:
    class _NonActivatingPanel(NSPanel):
        """Borderless NSPanel that can still become key.

        AppKit returns NO from -canBecomeKeyWindow for borderless panels by
        default, which blocks WKWebView text inputs from ever receiving
        keyboard focus (the user clicks the field and nothing happens).
        Overriding to YES makes the field usable.
        NSWindowStyleMaskNonactivatingPanel is still set on the instance, so
        becoming key still doesn't activate this Python process — Safari
        stays in the foreground.
        """

        def canBecomeKeyWindow(self):
            return True

    class _ClickableWebView(WKWebView):
        """WKWebView that returns YES from acceptsFirstMouse:.

        Without this, the first click after the panel loses key status
        (e.g. user just clicked Safari) is swallowed by AppKit while it
        promotes the panel back to key — the button click never fires, and
        the user has to tap a second time. Returning YES tells AppKit to
        forward the very first click straight to the view, so single-tap
        works regardless of key-window state.
        """

        def acceptsFirstMouse_(self, event):
            return True

    class _NextHandler(NSObject):
        """WKScriptMessageHandler — sets self._event when JS posts to 'next_clicked'.

        No custom init: PyObjC's bridged NSObject.init takes no args, so
        calling NSObject.init(self) inside a subclass crashes with "Need 0
        arguments, got 1". Instead the owner allocates with the default init
        and assigns `_event` as a plain Python attribute right afterwards.
        """

        def userContentController_didReceiveScriptMessage_(self, controller, message):
            try:
                self._event.set()
            except Exception:
                # Never let a bridge error propagate into WebKit, but leave
                # a trace instead of failing silently.
                logger.debug("banner: next_clicked handler failed", exc_info=True)

    class _HeightHandler(NSObject):
        """WKScriptMessageHandler — forwards the height posted by JS to the
        owning banner's _on_height_changed. `_banner` is assigned by the
        owner after allocation."""

        def userContentController_didReceiveScriptMessage_(self, controller, message):
            try:
                banner = self._banner
                if banner is not None:
                    banner._on_height_changed(int(message.body()))
            except Exception:
                logger.debug("banner: height_changed handler failed", exc_info=True)

    class _ChoiceHandler(NSObject):
        """WKScriptMessageHandler for the two-button choice row. Stores the
        posted message body as a string on self._value, then sets self._event."""

        def userContentController_didReceiveScriptMessage_(self, controller, message):
            try:
                self._value = str(message.body())
                self._event.set()
            except Exception:
                logger.debug("banner: choice_clicked handler failed", exc_info=True)

    class _SaveHandler(NSObject):
        """WKScriptMessageHandler for the token input. Stores the typed
        string on self._value, then sets self._event."""

        def userContentController_didReceiveScriptMessage_(self, controller, message):
            try:
                self._value = str(message.body())
                self._event.set()
            except Exception:
                logger.debug("banner: save_clicked handler failed", exc_info=True)

    class _RevealHandler(NSObject):
        """WKScriptMessageHandler fired by JS when the word-by-word setMsg
        reveal finishes. Used to gate control-set visibility on stream
        completion so buttons don't pop in mid-sentence."""

        def userContentController_didReceiveScriptMessage_(self, controller, message):
            try:
                self._event.set()
            except Exception:
                logger.debug("banner: reveal_done handler failed", exc_info=True)
else:
    # Keep every name defined so references fail predictably (None has no
    # .alloc) rather than with a NameError when PyObjC is missing.
    _NonActivatingPanel = None
    _ClickableWebView = None
    _NextHandler = None
    _HeightHandler = None
    _ChoiceHandler = None
    _SaveHandler = None
    _RevealHandler = None
+
+
class StatusBanner:
    """Always-on-top status pill that a worker thread can drive.

    The public methods (show / update / wait_for_* / close) are callable
    from any thread: they hand the AppKit work to the main run loop through
    ``callAfter``. The underscore-prefixed methods below the "main-thread
    implementations" marker are the halves that run there. When PyObjC could
    not be imported (``_COCOA_OK`` is false) every public method is a no-op.

    ``compact=True`` builds a fixed-size, click-through circular pill with no
    message, buttons or JS bridges.
    """

    W, MIN_H, MAX_H, TOP_MARGIN, RIGHT_MARGIN = 440, 44, 200, 56, 20
    # Compact variant: just the orb, no msg / button / scripts. Fixed-size
    # circular pill (W == H, radius == W/2). Used for "Telegram task running"
    # indicator — pure visual, click-through. Sized to hug the 36 px orb with
    # ~4 px breathing room — anything taller and the pill looks padded.
    COMPACT_W = COMPACT_H = 44

    # Cap the wait-for-reveal so a JS hiccup that drops the reveal_done
    # message can never deadlock us. Realistic banner messages stream out
    # in well under this — and shorter is better, because the wait is what
    # the user experiences between the message finishing and the button
    # showing.
    _REVEAL_WAIT_SEC = 3.0

    # Names of the JS→Python message handlers registered in standard mode.
    _HANDLER_NAMES = (
        "next_clicked",
        "height_changed",
        "choice_clicked",
        "save_clicked",
        "reveal_done",
    )

    def __init__(self, compact: bool = False):
        self._compact = compact
        self._window = None
        self._webview = None
        self._next_handler = None    # strong refs so the JS-bridge handlers
        self._height_handler = None  # don't get GC'd
        self._choice_handler = None
        self._save_handler = None
        self._reveal_handler = None
        self._next_event = threading.Event()
        self._choice_event = threading.Event()
        self._save_event = threading.Event()
        # Set initially: no streaming reveal is pending until update() is
        # called. update() clears this; the JS reveal_done handler re-sets it.
        self._reveal_event = threading.Event()
        self._reveal_event.set()
        self._current_h = self.COMPACT_H if compact else self.MIN_H

    # ---- public API (callable from any thread) ----

    def show(self):
        """Schedule creation of the panel on the main thread."""
        if not _COCOA_OK:
            return
        callAfter(self._create)

    def update(self, text):
        """Replace the status message with *text* (no-op in compact mode)."""
        # Compact pills have no msg span — silently no-op so callers don't
        # have to branch.
        if not _COCOA_OK or self._compact:
            return
        # A streaming reveal is about to start in JS; clear the event so any
        # following wait_for_* call blocks until JS posts reveal_done.
        self._reveal_event.clear()
        callAfter(self._set_text, text)

    def _await_reveal(self):
        """Block until the most recent update()'s reveal animation has
        finished (or the safety timeout fires). No-op if no update() is
        pending — the event stays set in that case."""
        self._reveal_event.wait(self._REVEAL_WAIT_SEC)

    def wait_for_next(self, timeout=None):
        """Block the calling thread until Next is clicked (or *timeout*).

        Shows the Next button on entry and hides it on exit, so during normal
        update() calls the button stays hidden.

        Returns the result of waiting on the click event: True when it was
        set, False on timeout. Returns True immediately when there is no
        banner to click (no Cocoa, or compact mode) so callers never hang.
        """
        if not _COCOA_OK:
            return True  # no banner → don't block forever
        if self._compact:
            # No Next button in compact mode — return immediately so callers
            # that accidentally chain it don't hang forever.
            return True
        # Clear the click event BEFORE the reveal wait. If we cleared after,
        # any click that lands during streaming (rare, since the button is
        # hidden until reveal finishes — but defensive) would be wiped here
        # and the user would have to click a second time.
        self._next_event.clear()
        self._await_reveal()
        callAfter(self._clear_extra_ui)
        callAfter(self._set_next_visible, True)
        clicked = self._next_event.wait(timeout)
        callAfter(self._set_next_visible, False)
        return clicked

    def wait_for_choice(self, left_label, right_label, timeout=None):
        """Show two side-by-side buttons; block until one is clicked.

        Returns the value stored by the choice handler, or None on timeout,
        in compact mode, or when Cocoa is unavailable."""
        if not _COCOA_OK or self._compact:
            return None
        self._choice_event.clear()
        self._await_reveal()
        callAfter(self._set_next_visible, False)
        callAfter(self._show_choice, left_label, right_label)
        clicked = self._choice_event.wait(timeout)
        value = getattr(self._choice_handler, "_value", None) if clicked else None
        callAfter(self._clear_extra_ui)
        return value

    def wait_for_input(self, save_label="Save", validate=None,
                       error_msg="Token can't be empty"):
        """Show a text input + Save button and block until a valid submit.

        `validate` defaults to "non-empty after strip". A failed validation
        surfaces `error_msg` and keeps waiting. Returns the accepted value,
        or None when Cocoa is unavailable, in compact mode, or when the
        banner is torn down while waiting."""
        if not _COCOA_OK or self._compact:
            return None
        if validate is None:
            validate = lambda v: bool((v or "").strip())
        self._save_event.clear()
        self._await_reveal()
        callAfter(self._set_next_visible, False)
        callAfter(self._show_input, save_label)
        try:
            while True:
                self._save_event.wait()
                # _destroy() also sets the event — bail out if the banner
                # has been torn down out from under us.
                if self._webview is None:
                    return None
                value = getattr(self._save_handler, "_value", "") or ""
                if validate(value):
                    return value
                callAfter(self._set_input_error, error_msg)
                self._save_event.clear()
        finally:
            callAfter(self._clear_extra_ui)

    def close(self):
        """Schedule teardown of the panel on the main thread."""
        if not _COCOA_OK:
            return
        callAfter(self._destroy)

    # ---- main-thread implementations ----

    @classmethod
    def _top_right_origin(cls, screen_frame, w_px, h_px):
        """Return the (x, y) bottom-left origin that puts a w_px × h_px
        window in the top-right corner of *screen_frame*.

        The frame's own origin is added in: a screen frame lives in the
        global screen coordinate space, so its origin is non-zero whenever
        the main screen is not the primary display. Ignoring it would place
        the pill on the wrong display (or off-screen).
        """
        x = screen_frame.origin.x + screen_frame.size.width - w_px - cls.RIGHT_MARGIN
        y = screen_frame.origin.y + screen_frame.size.height - h_px - cls.TOP_MARGIN
        return x, y

    def _create(self):
        """Build the panel + WebView and put it on screen."""
        try:
            scr = NSScreen.mainScreen().frame()
            if self._compact:
                w_px, h_px = self.COMPACT_W, self.COMPACT_H
                corner = w_px / 2.0
                html = COMPACT_HTML
                ignores_mouse = True  # click-through; purely visual
            else:
                w_px, h_px = self.W, self.MIN_H
                corner = self.MIN_H / 2.0
                html = BANNER_HTML
                ignores_mouse = False
            x, y = self._top_right_origin(scr, w_px, h_px)
            rect = NSMakeRect(x, y, w_px, h_px)

            w = _NonActivatingPanel.alloc().initWithContentRect_styleMask_backing_defer_(
                rect, NSWindowStyleMaskNonactivatingPanel,
                NSBackingStoreBuffered, False,
            )
            w.setLevel_(NSStatusWindowLevel)
            w.setOpaque_(False)
            w.setBackgroundColor_(NSColor.clearColor())
            w.setIgnoresMouseEvents_(ignores_mouse)
            w.setHasShadow_(True)
            w.setReleasedWhenClosed_(False)
            # Panels normally hide when their app deactivates — we want the
            # banner to stay visible the entire time Safari is in front.
            # Leave becomesKeyOnlyIfNeeded at the NSPanel default (NO) so a
            # click on the token input properly makes the panel key and the
            # field accepts paste / typing. NonactivatingPanelMask means
            # becoming key still doesn't activate the Python process.
            try:
                w.setHidesOnDeactivate_(False)
            except Exception:
                pass

            content = w.contentView()
            content.setWantsLayer_(True)
            content.layer().setBackgroundColor_(
                NSColor.colorWithCalibratedRed_green_blue_alpha_(1.0, 1.0, 1.0, 0.96).CGColor()
            )
            # Fixed at MIN_H/2 so the pill stays a stadium at default height
            # and becomes a rounded-rectangle when the height grows to fit
            # multi-line messages — cleaner than a fat oval. In compact mode
            # we use W/2 → perfect circle.
            content.layer().setCornerRadius_(corner)
            content.layer().setMasksToBounds_(True)

            cfg = WKWebViewConfiguration.alloc().init()

            # JS→Python bridges only relevant in standard mode (compact pill
            # has no Next button and a fixed size — no need for any handler).
            if not self._compact:
                ucc = cfg.userContentController()

                nh = _NextHandler.alloc().init()
                nh._event = self._next_event
                ucc.addScriptMessageHandler_name_(nh, "next_clicked")

                hh = _HeightHandler.alloc().init()
                hh._banner = self
                ucc.addScriptMessageHandler_name_(hh, "height_changed")

                ch = _ChoiceHandler.alloc().init()
                ch._event = self._choice_event
                ch._value = None
                ucc.addScriptMessageHandler_name_(ch, "choice_clicked")

                sh = _SaveHandler.alloc().init()
                sh._event = self._save_event
                sh._value = ""
                ucc.addScriptMessageHandler_name_(sh, "save_clicked")

                rh = _RevealHandler.alloc().init()
                rh._event = self._reveal_event
                ucc.addScriptMessageHandler_name_(rh, "reveal_done")
            else:
                nh = hh = ch = sh = rh = None

            wv_rect = NSMakeRect(0, 0, w_px, h_px)
            wv = _ClickableWebView.alloc().initWithFrame_configuration_(wv_rect, cfg)
            try:
                wv.setValue_forKey_(False, "drawsBackground")
            except Exception:
                pass
            try:
                wv.setWantsLayer_(True)
                wv.layer().setBackgroundColor_(NSColor.clearColor().CGColor())
            except Exception:
                pass
            # NSViewWidthSizable (2) | NSViewHeightSizable (16). When the
            # window animates between sizes (multi-line message growing,
            # collapsing back to single line), the WebView's frame follows
            # the animation instead of snapping — that's what makes the
            # pill grow/shrink as a smooth shape.
            try:
                wv.setAutoresizingMask_(2 | 16)
            except Exception:
                pass
            wv.loadHTMLString_baseURL_(html, None)
            content.addSubview_(wv)

            w.orderFrontRegardless()
            # Make the panel key on show so the first user click on Next
            # registers as the button click — not as "promote panel to key".
            # NonActivatingPanelMask means becoming key still doesn't
            # activate this Python process, so Safari stays in front.
            if not self._compact:
                try:
                    w.makeKeyWindow()
                except Exception:
                    pass
            self._window, self._webview = w, wv
            self._next_handler, self._height_handler = nh, hh
            self._choice_handler, self._save_handler = ch, sh
            self._reveal_handler = rh
            self._current_h = h_px
        except Exception as e:
            logger.warning("banner: _create failed (%s)", e)

    def _eval_js(self, js):
        """Evaluate *js* in the WebView; no-op when there is no WebView."""
        webview = self._webview
        if webview is None:
            return
        try:
            webview.evaluateJavaScript_completionHandler_(js, None)
        except Exception:
            logger.debug("banner: evaluateJavaScript failed", exc_info=True)

    def _call_page_function(self, func, *args):
        """Call ``window.<func>(args…)`` if the page defines it. Every
        argument is passed as an escaped single-quoted JS string literal."""
        try:
            quoted = ", ".join(f"'{self._js_escape(a)}'" for a in args)
        except Exception:
            logger.debug("banner: could not escape args for %s", func, exc_info=True)
            return
        self._eval_js(f"if (window.{func}) {func}({quoted});")

    def _set_text(self, text):
        """Push *text* into the message span."""
        try:
            safe = self._js_escape(text)
        except Exception:
            logger.debug("banner: could not escape status text", exc_info=True)
            return
        # Primary path: hand the full text to JS which animates it
        # word-by-word and fires reveal_done when finished. Fallback:
        # if the page-side script hasn't run yet (window.setMsg is
        # undefined — happens for the very first update right after
        # the WebView starts loading), set textContent directly and
        # post reveal_done ourselves so wait_for_next doesn't sit on
        # its safety timeout.
        js = (f"if (window.setMsg) {{ setMsg('{safe}'); }}"
              f" else {{"
              f" var m = document.getElementById('msg');"
              f" if (m) m.textContent = '{safe}';"
              f" try {{ webkit.messageHandlers.reveal_done.postMessage(1); }}"
              f" catch (e) {{}}"
              f" }}")
        self._eval_js(js)

    def _set_next_visible(self, visible):
        """Show or hide the Next button."""
        disp = "inline-block" if visible else "none"
        self._eval_js(
            f"var b=document.getElementById('next'); "
            f"if (b) b.style.display='{disp}';"
        )

    @staticmethod
    def _js_escape(text):
        """Return str(text) made safe for a single-quoted JS string literal:
        backslashes and single quotes are escaped, CR / LF become spaces."""
        return (str(text)
                .replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace("\n", " ")
                .replace("\r", " "))

    def _show_choice(self, left_label, right_label):
        """Ask the page to show the two-button choice row."""
        self._call_page_function("setChoice", left_label, right_label)

    def _show_input(self, save_label):
        """Ask the page to show the text input with its save button."""
        self._call_page_function("setInput", save_label)

    def _set_input_error(self, msg):
        """Ask the page to show *msg* as the input error (None → empty)."""
        self._call_page_function("setInputError", msg or "")

    def _clear_extra_ui(self):
        """Ask the page to clear the extra controls via clearAll()."""
        self._call_page_function("clearAll")

    def _on_height_changed(self, requested_h):
        """Resize the NSWindow to match the WebView's content height.

        Top edge stays put — height grows downward by adjusting NSWindow's
        bottom-left origin Y. Clamped to [MIN_H, MAX_H].
        """
        try:
            if self._window is None:
                return
            new_h = max(self.MIN_H, min(int(requested_h), self.MAX_H))
            if abs(new_h - self._current_h) < 1:
                return
            self._current_h = new_h
            frame = self._window.frame()
            # NSWindow origin is bottom-left; to keep top edge fixed while
            # height changes, shift origin Y by (old_h - new_h).
            new_y = frame.origin.y + frame.size.height - new_h
            new_frame = NSMakeRect(frame.origin.x, new_y, frame.size.width, new_h)
            self._window.setFrame_display_animate_(new_frame, True, True)
            # The WebView resizes with the window via its autoresizingMask
            # (set in _create), so no manual setFrame snap is needed here —
            # snapping would override the in-flight animation and the pill
            # would visually jump to its final size rather than morph.
        except Exception as e:
            logger.warning("banner: _on_height_changed failed (%s)", e)

    def _destroy(self):
        """Tear the panel down and release any thread blocked in wait_for_*."""
        try:
            if self._webview is not None:
                try:
                    self._webview.stopLoading()
                except Exception:
                    pass
                try:
                    cfg = self._webview.configuration()
                    if cfg is not None:
                        uc = cfg.userContentController()
                        for handler_name in self._HANDLER_NAMES:
                            uc.removeScriptMessageHandlerForName_(handler_name)
                except Exception:
                    pass
            if self._window is not None:
                self._window.orderOut_(None)
        except Exception:
            pass
        finally:
            # Wake every waiter so no worker thread stays blocked on a
            # banner that no longer exists.
            for ev in (self._next_event, self._choice_event,
                       self._save_event, self._reveal_event):
                try:
                    ev.set()
                except Exception:
                    pass
            self._window = None
            self._webview = None
            self._next_handler = None
            self._height_handler = None
            self._choice_handler = None
            self._save_handler = None
            self._reveal_handler = None
diff --git a/Auto_Use/macOS_use/remote_connection/telegram/service.py b/Auto_Use/macOS_use/remote_connection/telegram/service.py
new file mode 100644
index 0000000..d6e539b
--- /dev/null
+++ b/Auto_Use/macOS_use/remote_connection/telegram/service.py
@@ -0,0 +1,847 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Telegram → AgentService bridge with a guided provider/model picker.
+
+Runs as a standalone process (not mounted into Flask). On the first message
+the bot asks you to pick a provider (limited to providers with a non-empty
+key in api_key.txt), then a model (from the same MODEL_MAPPINGS the
+AutoUse frontend uses). Subsequent messages are dispatched as tasks to the
+agent with that provider/model. Picked provider/model persist for the whole
+chat session until you `/reset`.
+
+Token lookup:
+ TELEGRAM_BOT_TOKEN is read from Auto_Use/api_key/api_key.txt only.
+ Environment variables and .env are intentionally not consulted, so
+ api_key.txt stays the single source of truth for the bot.
+
+Setup:
+ 1. @BotFather → /newbot → copy token.
+ 2. Paste it into api_key.txt as TELEGRAM_BOT_TOKEN=…
+ 3. Make sure at least one provider key (e.g. OPENROUTER_API_KEY=…) is set
+ in api_key.txt.
+ 4. python -m Auto_Use.macOS_use.remote_connection.telegram.service
+ 5. On phone: open Telegram, find your bot, send any message.
+"""
+import asyncio
+import datetime
+import importlib
+import logging
+import sys
+import threading
+from pathlib import Path
+
+from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
+from telegram.ext import (
+ Application,
+ CommandHandler,
+ MessageHandler,
+ CallbackQueryHandler,
+ filters,
+)
+
logger = logging.getLogger(__name__)

# api_key.txt is the only file this Telegram surface reads keys from; .env
# and environment variables are deliberately not consulted here, so there is
# exactly one file of record for the bot.
#
# The location mirrors app.py's get_auto_use_path(): in a compiled/frozen
# build __file__ points INSIDE the bundle, so walking up from it would miss
# the editable api_key.txt that sits next to the executable (the one the
# Settings panel and the regular agent use). In dev (python app.py) the
# source tree is used instead:
# service.py → telegram → remote_connection → macOS_use → Auto_Use → repo root
_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals()
_API_KEY_FILE = (
    Path(sys.executable).parent if _IS_COMPILED else Path(__file__).resolve().parents[4]
) / "Auto_Use" / "api_key" / "api_key.txt"

# The agent writes per-step "milestone" lines to this file. It is tailed
# during a task and each new line is forwarded to the user's Telegram chat
# so they can follow progress in real time.
SCRATCHPAD_PATH = Path(__file__).resolve().parents[2].joinpath(
    "scratchpad", "milestone", "milestone.md"
)
SCRATCHPAD_POLL_SEC = 2.0
MAX_TG_MSG_LEN = 4000  # Telegram caps at 4096; leave headroom for safety

# Provider id → API-key name in the KV file. Every key name is the upper-cased
# provider id followed by _API_KEY. Same mapping the Windows side uses
# ([windows_use/remote_connection/telegram/service.py:44-51]).
PROVIDER_KEY_MAP = {
    provider: f"{provider.upper()}_API_KEY"
    for provider in (
        "openrouter",
        "groq",
        "openai",
        "anthropic",
        "google",
        "perplexity",
    )
}
+
+
+# ── file helpers ─────────────────────────────────────────────────────────────
+
def _read_all_keys(path: Path) -> dict:
    """Parse a simple KEY=VALUE file (one entry per line) into a dict.

    Blank lines, lines starting with '#', lines without '=' and entries
    whose value is empty are skipped. Keys and values are stripped of
    surrounding whitespace; only the first '=' splits, so values may
    themselves contain '='. When a key appears more than once the last
    occurrence wins.

    A missing file yields an empty dict. Any other read/decoding failure is
    logged and whatever was parsed up to that point is returned.
    """
    out = {}
    try:
        # utf-8-sig drops a leading BOM (common after editing the file on
        # Windows); with plain utf-8 the BOM would be glued onto the first
        # key and that key would silently never be found.
        with open(path, "r", encoding="utf-8-sig") as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith("#"):
                    continue
                k, sep, v = line.partition("=")
                if not sep:
                    continue
                k, v = k.strip(), v.strip()
                if v:
                    out[k] = v
    except FileNotFoundError:
        pass  # no file simply means no keys configured
    except (OSError, UnicodeDecodeError):
        logger.warning("failed to read key file %s", path, exc_info=True)
    return out
+
+
+def _resolve_token() -> str | None:
+ """Read TELEGRAM_BOT_TOKEN from api_key.txt only. .env and env vars are
+ intentionally ignored — see header comment."""
+ return _read_all_keys(_API_KEY_FILE).get("TELEGRAM_BOT_TOKEN")
+
+
def _get_available_providers() -> list:
    """List the providers that have a non-empty key in api_key.txt.

    Each entry is ``{"id": <provider id>, "key": <api key>}``, in
    PROVIDER_KEY_MAP order.
    """
    file_keys = _read_all_keys(_API_KEY_FILE)
    available = []
    for provider_id, key_name in PROVIDER_KEY_MAP.items():
        api_key = file_keys.get(key_name)
        if api_key:
            available.append({"id": provider_id, "key": api_key})
    return available
+
+
def _set_key_in_file(path: Path, key: str, value: str) -> None:
    """Write/update ``KEY=value`` in a KV file, preserving every other line.

    Unlike a naive read-all-and-write-back-with-_read_all_keys, this keeps
    empty-value placeholder lines (e.g. GROQ_API_KEY=) intact — the AutoUse
    UI relies on those for its provider list rendering.

    A line counts as the entry for *key* when the text before its first '='
    equals *key* after stripping whitespace, so ``KEY = old`` is updated in
    place instead of a duplicate being appended. If no line matches, the
    entry is appended. A missing file is created (with parent directories).

    The new content is written to a sibling temp file and moved over the
    original with os.replace, so a failure mid-write cannot leave the key
    file truncated. Failures are logged; nothing is raised.
    """
    import contextlib
    import os
    import shutil

    entry = f"{key}={value}\n"
    lines = []
    found = False
    try:
        # utf-8-sig: tolerate a leading BOM so it can't hide the first key.
        with open(path, "r", encoding="utf-8-sig") as f:
            for raw in f:
                name, sep, _ = raw.partition("=")
                if sep and name.strip() == key:
                    lines.append(entry)
                    found = True
                else:
                    lines.append(raw if raw.endswith("\n") else raw + "\n")
    except FileNotFoundError:
        pass  # nothing to preserve; the file is created below
    except Exception:
        # Could not read the existing content: do not overwrite it.
        logger.warning(
            "failed to read %s while updating %s", path, key, exc_info=True
        )
        return
    if not found:
        lines.append(entry)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(lines)
        # Carry over the existing permission bits (if there is a file yet).
        with contextlib.suppress(OSError):
            shutil.copymode(path, tmp_path)
        os.replace(tmp_path, path)
    except Exception:
        logger.warning("failed to write %s", path, exc_info=True)
        with contextlib.suppress(OSError):
            tmp_path.unlink()
+
+
+def _resolve_owner_chat_id() -> int | None:
+ """Owner chat_id = whoever last sent /start. Stored in api_key.txt as
+ TELEGRAM_OWNER_CHAT_ID=…, so it survives restarts."""
+ val = _read_all_keys(_API_KEY_FILE).get("TELEGRAM_OWNER_CHAT_ID")
+ if not val:
+ return None
+ try:
+ return int(val)
+ except ValueError:
+ return None
+
+
def _save_owner_chat_id(chat_id: int) -> None:
    """Store *chat_id* in api_key.txt as TELEGRAM_OWNER_CHAT_ID so the owner
    can be messaged on the next boot."""
    serialized = str(chat_id)
    _set_key_in_file(_API_KEY_FILE, "TELEGRAM_OWNER_CHAT_ID", serialized)
+
+
def _get_models_for_provider(provider_id: str) -> list:
    """Return the selectable models of a provider.

    Reads MODEL_MAPPINGS from the module
    ``Auto_Use.macOS_use.llm_provider.<provider_id>.view`` and returns the
    entries not flagged ``hidden`` as ``[{"id", "display_name"}, …]``;
    ``display_name`` falls back to the model id.

    Returns an empty list when *provider_id* is not a plain identifier or
    when the module cannot be imported / read (the failure is logged).
    """
    # provider_id is spliced into a module path: accept a single identifier
    # only, so a crafted value containing dots cannot redirect the import to
    # some other module.
    if not isinstance(provider_id, str) or not provider_id.isidentifier():
        logger.warning("model lookup: rejecting invalid provider id %r", provider_id)
        return []
    try:
        mod = importlib.import_module(
            f"Auto_Use.macOS_use.llm_provider.{provider_id}.view"
        )
        mappings = getattr(mod, "MODEL_MAPPINGS", {})
        return [
            {"id": mid, "display_name": info.get("display_name", mid)}
            for mid, info in mappings.items()
            if not info.get("hidden", False)
        ]
    except Exception:
        logger.warning(
            "model lookup: could not load models for provider %r",
            provider_id,
            exc_info=True,
        )
        return []
+
+
+# ── per-chat state ───────────────────────────────────────────────────────────
+
# Per-chat conversation state, keyed by chat_id. Each value is a dict with:
#   "phase":           "idle" | "pick_provider" | "pick_model" | "ready" | "running"
#   "provider":        str | None
#   "model":           str | None
#   "model_display":   str | None
#   "queue":           list[str]       — tasks waiting to run, FIFO
#   "pending":         dict[str, str]  — pending_id → task awaiting Yes/No
#   "pending_counter": int             — monotonic id source for pending
_chat_state: dict = {}

# Serialises read-modify-write sequences on the state across threads (the
# queue can be drained both from _run_agent's finally and from the callback
# handler when the user taps "Yes").
_state_lock = threading.Lock()
+
+
def _state(chat_id: int) -> dict:
    """Return the state dict of *chat_id*, registering a new one in the
    "idle" phase the first time the chat is seen."""
    fresh = {"phase": "idle"}
    return _chat_state.setdefault(chat_id, fresh)
+
+
def _maybe_run_next_queued(chat_id: int, bot, loop) -> None:
    """Start the next queued task of a chat, if there is one to start.

    Nothing happens unless the chat is known, its phase is "ready", its
    queue is non-empty and a provider and model are selected. Otherwise the
    first queued task is popped, the phase becomes "running", the chat is
    notified and the task is run by _run_agent on a daemon thread.

    The state is inspected and mutated under _state_lock, so this may be
    called from the worker thread (_run_agent's finally) as well as from the
    asyncio loop (the q+ callback).
    """
    with _state_lock:
        state = _chat_state.get(chat_id)
        if not state or state.get("phase") != "ready":
            return
        waiting = state.get("queue") or []
        provider, model = state.get("provider"), state.get("model")
        if not waiting or not provider or not model:
            return
        next_task = waiting.pop(0)
        display = state.get("model_display") or model
        state["phase"] = "running"

    # Notification and thread start happen outside the lock.
    announcement = f"📝 Running queued task: {next_task[:200]} ({provider} · {display})"
    _send_chat(bot, chat_id, announcement, loop)
    worker = threading.Thread(
        target=_run_agent,
        args=(next_task, provider, model, chat_id, bot, loop),
        daemon=True,
        name=f"telegram-agent-{chat_id}-queued",
    )
    worker.start()
+
+
+# ── Telegram handlers ────────────────────────────────────────────────────────
+
def _build_online_text(providers: list) -> str:
    """Build the two-line "AutoUse online" greeting: the current local time
    (HH:MM:SS) followed by the comma-separated provider ids, or a
    "(none configured)" placeholder when *providers* is empty."""
    now_str = datetime.datetime.now().strftime("%H:%M:%S")
    if providers:
        provider_line = ", ".join(entry["id"] for entry in providers)
    else:
        provider_line = "(none configured)"
    return f"🟢 AutoUse online at {now_str}\nProviders: {provider_line}"
+
+
async def _show_provider_picker(message):
    """Reply to *message* with the online status line and a provider picker.

    Returns True when the picker was sent, False when no provider key is
    configured (a hint on how to add one is sent instead).
    """
    providers = _get_available_providers()
    # Always lead with the "AutoUse online" status line so the user gets the
    # same greeting they'd see at app boot, even when they message the bot
    # first instead of waiting for the unsolicited startup announcement.
    await message.reply_text(_build_online_text(providers))
    if not providers:
        # Only api_key.txt is read for provider keys, so that is the only
        # file the hint may point the user at.
        await message.reply_text(
            "⚠️ No provider API keys found. Add at least one (e.g. "
            "OPENROUTER_API_KEY=…) to api_key.txt and try again."
        )
        return False
    # One button per row; the callback data carries the provider id.
    buttons = [
        [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")]
        for p in providers
    ]
    await message.reply_text(
        "👋 Pick a provider:", reply_markup=InlineKeyboardMarkup(buttons)
    )
    return True
+
+
+async def _discover_owner_from_updates(bot) -> int | None:
+ """Peek at the latest pending update on Telegram's servers and use its
+ chat_id as the owner. Lets the bot self-bootstrap on the very first run
+ after the chat-saving code was deployed, without requiring the user to
+ /start again. Safe to call before start_polling — uses offset=-1 which
+ Telegram supports as 'just the most recent update', and doesn't consume
+ updates from the polling updater's offset cursor."""
+ try:
+ updates = await bot.get_updates(offset=-1, limit=1, timeout=2)
+ except Exception:
+ logger.warning("owner discovery: get_updates failed", exc_info=True)
+ return None
+ for upd in updates:
+ chat = getattr(upd, "effective_chat", None)
+ if chat and chat.id:
+ return int(chat.id)
+ return None
+
+
async def _post_init(application) -> None:
    """Greet the owner once the bot has finished initialising.

    Sends 'AutoUse online at …' to the saved owner chat, followed by a fresh
    provider picker, so the user doesn't have to send anything to get going.
    When no owner is saved yet, one is looked up from Telegram's pending
    updates and persisted. With no owner at all, nothing is sent.
    """
    bot = application.bot
    owner_id = _resolve_owner_chat_id()
    if not owner_id:
        # Not saved yet — try to auto-discover from Telegram's pending
        # updates. Works if the user has ever messaged the bot, even before
        # the chat-saving code was deployed.
        owner_id = await _discover_owner_from_updates(bot)
        if not owner_id:
            # No owner anywhere — they've never interacted with the bot.
            # Stay silent; they'll register themselves with /start.
            return
        # Persist the result so discovery isn't needed on every boot.
        try:
            _save_owner_chat_id(owner_id)
            logger.info(
                "owner discovery: saved chat_id=%s from getUpdates",
                owner_id,
            )
        except Exception:
            logger.warning("owner discovery: could not persist chat_id", exc_info=True)

    providers = _get_available_providers()
    greeting = _build_online_text(providers)
    try:
        await bot.send_message(chat_id=owner_id, text=greeting)
    except Exception:
        logger.exception("startup announcement: failed to send hello")
        return  # if we can't even greet, don't bother with the picker

    if not providers:
        try:
            await bot.send_message(
                chat_id=owner_id,
                text="⚠️ No provider API keys found. Add at least one to api_key.txt and /reset.",
            )
        except Exception:
            pass
        return

    # One button per row; the callback data carries the provider id.
    keyboard_rows = []
    for entry in providers:
        button = InlineKeyboardButton(
            entry["id"], callback_data=f"provider:{entry['id']}"
        )
        keyboard_rows.append([button])
    try:
        await bot.send_message(
            chat_id=owner_id,
            text="👋 Pick a provider:",
            reply_markup=InlineKeyboardMarkup(keyboard_rows),
        )
        # Park the owner's chat in pick_provider so the next button tap
        # routes cleanly through the existing callback flow.
        _chat_state[owner_id] = {"phase": "pick_provider"}
    except Exception:
        logger.exception("startup announcement: failed to send provider picker")
+
+
async def start_cmd(update, ctx):
    """Handle /start: remember the chat as owner and show the provider picker.

    The chat's state is replaced with ``{"phase": "pick_provider"}``; if the
    picker could not be shown (no providers) it falls back to ``idle``.
    """
    chat_id = update.effective_chat.id
    # CommandHandler also fires for *edited* commands, where update.message
    # is None; effective_message covers both cases so the reply never
    # dereferences None.
    message = update.effective_message
    # Remember this chat so future boots can auto-greet. Best-effort — never
    # let a file-write failure block /start.
    try:
        _save_owner_chat_id(chat_id)
    except Exception:
        logger.warning("could not persist owner chat_id", exc_info=True)
    _chat_state[chat_id] = {"phase": "pick_provider"}
    ok = await _show_provider_picker(message)
    if not ok:
        _chat_state[chat_id] = {"phase": "idle"}
+
+
async def reset_cmd(update, ctx):
    """Handle /reset: replace this chat's state with a fresh ``idle`` state.

    Replacing the dict drops any queued tasks and pending Yes/No prompts
    stored in it. The persisted owner chat id is deliberately left alone:
    /reset means "start the conversation over", not "forget I exist".

    NOTE(review): this does not stop an agent thread that is already
    running — confirm whether that is intended.
    """
    _chat_state[update.effective_chat.id] = {"phase": "idle"}
    # effective_message instead of message: CommandHandler also delivers
    # edited commands, for which update.message is None.
    await update.effective_message.reply_text(
        "🔄 Reset. Send any message to pick a provider again."
    )
+
+
async def text_handler(update, ctx):
    """Route a plain-text message according to the chat's current phase.

    * ``idle`` / ``pick_provider`` — (re)send the provider picker.
    * ``pick_model`` — remind the user to pick a model first.
    * ``running`` — offer to queue the text as a follow-up task.
    * otherwise (``ready``) — start the agent on the text in a daemon thread.
    """
    message = update.message
    if message is None:
        # The TEXT filter also matches edited messages and channel posts, for
        # which update.message is None. Ignore them rather than crash, and
        # avoid re-running a task just because its text was edited.
        return

    chat_id = update.effective_chat.id
    # Persist on every message, not just /start, so the next app boot can
    # auto-announce without the user having to /start first.
    try:
        _save_owner_chat_id(chat_id)
    except Exception:
        logger.warning("could not persist owner chat_id", exc_info=True)
    state = _state(chat_id)
    phase = state.get("phase", "idle")

    if phase in ("idle", "pick_provider"):
        state["phase"] = "pick_provider"
        ok = await _show_provider_picker(message)
        if not ok:
            state["phase"] = "idle"
        return

    if phase == "pick_model":
        await message.reply_text(
            "Pick a model from the buttons above first."
        )
        return

    task = (message.text or "").strip()
    if not task:
        return

    if phase == "running":
        # Busy — offer to queue this task. Each pending prompt gets a unique
        # id so several "queue this?" prompts can coexist if the user spams.
        state.setdefault("pending", {})
        state["pending_counter"] = state.get("pending_counter", 0) + 1
        pending_id = str(state["pending_counter"])
        state["pending"][pending_id] = task
        buttons = [[
            InlineKeyboardButton("✅ Yes, queue it", callback_data=f"q+:{pending_id}"),
            InlineKeyboardButton("❌ No", callback_data=f"q-:{pending_id}"),
        ]]
        await message.reply_text(
            f"⏳ Currently busy performing a task.\n"
            f"Do you want to queue: \"{task[:200]}\" ?",
            reply_markup=InlineKeyboardMarkup(buttons),
        )
        return

    # phase == "ready"
    state["phase"] = "running"
    provider = state["provider"]
    model = state["model"]
    display = state.get("model_display", model)
    try:
        await message.reply_text(
            f"📝 Running: {task} ({provider} · {display})"
        )
    except Exception:
        # The phase is already "running". If this echo failed and we bailed
        # out, no worker would ever reset it and the chat would be stuck.
        # Log and start the task anyway.
        logger.warning(
            "could not send 'Running' echo to chat %s", chat_id, exc_info=True
        )
    bot = ctx.bot
    loop = asyncio.get_running_loop()
    threading.Thread(
        target=_run_agent,
        args=(task, provider, model, chat_id, bot, loop),
        daemon=True,
    ).start()
+
+
async def callback_handler(update, ctx):
    """Handle inline-button taps, dispatching on the callback-data prefix.

    * ``provider:<id>`` — store the provider and show its model list.
    * ``model:<id>`` — store the model and move the chat to ``ready``.
    * ``q+:<n>`` — move pending task *n* into the queue.
    * ``q-:<n>`` — discard pending task *n*.

    Provider/model taps are refused while a task is running. Otherwise a
    stale picker button would overwrite the ``running`` phase and allow a
    second agent to be started alongside the first.
    """
    query = update.callback_query
    await query.answer()
    message = query.message
    if message is None:
        # The Bot API marks callback_query.message as optional; without it
        # there is no chat to route the tap to.
        return
    chat_id = message.chat_id
    try:
        _save_owner_chat_id(chat_id)
    except Exception:
        logger.warning("could not persist owner chat_id", exc_info=True)
    state = _state(chat_id)
    data = query.data or ""

    if data.startswith(("provider:", "model:")) and state.get("phase") == "running":
        # Leave the tapped keyboard untouched so it can be used afterwards.
        await ctx.bot.send_message(
            chat_id=chat_id,
            text=(
                "⏳ A task is still running. Wait for it to finish before "
                "changing the provider or model."
            ),
        )
        return

    if data.startswith("provider:"):
        provider_id = data.split(":", 1)[1]
        state["provider"] = provider_id
        state["phase"] = "pick_model"
        models = _get_models_for_provider(provider_id)
        if not models:
            state["phase"] = "pick_provider"
            await query.edit_message_text(
                f"⚠️ No models found for {provider_id}. Pick another provider."
            )
            return
        buttons = [
            [InlineKeyboardButton(m["display_name"], callback_data=f"model:{m['id']}")]
            for m in models
        ]
        await query.edit_message_text(
            f"Pick a model for {provider_id}:",
            reply_markup=InlineKeyboardMarkup(buttons),
        )
        return

    if data.startswith("model:"):
        model_id = data.split(":", 1)[1]
        provider_id = state.get("provider")
        if not provider_id:
            # State was wiped (e.g. /reset) between the two picker steps.
            state["phase"] = "idle"
            await query.edit_message_text("Session expired. Send any message to start over.")
            return
        models = _get_models_for_provider(provider_id)
        # Fall back to the raw id when the model is no longer listed.
        display = next(
            (m["display_name"] for m in models if m["id"] == model_id), model_id
        )
        state["model"] = model_id
        state["model_display"] = display
        state["phase"] = "ready"
        await query.edit_message_text(
            f"✅ Provider: {provider_id} / Model: {display}\n"
            f"Send me a task whenever you're ready."
        )
        return

    if data.startswith("q+:"):
        # User wants to queue the pending task.
        pending_id = data.split(":", 1)[1]
        task = (state.get("pending") or {}).pop(pending_id, None)
        if not task:
            await query.edit_message_text("(That prompt has already been handled.)")
            return
        state.setdefault("queue", []).append(task)
        qlen = len(state["queue"])
        await query.edit_message_text(
            f"📥 Queued (position {qlen}): \"{task[:200]}\"\n"
            f"Will run when the current task finishes."
        )
        # Edge case: the agent finished between the prompt being sent and the
        # user tapping Yes. Drain the queue now so the task isn't stranded.
        _maybe_run_next_queued(chat_id, ctx.bot, asyncio.get_running_loop())
        return

    if data.startswith("q-:"):
        # User declines to queue. Drop the pending task.
        pending_id = data.split(":", 1)[1]
        (state.get("pending") or {}).pop(pending_id, None)
        await query.edit_message_text(
            "👍 OK, won't queue it. I'll let you know once the current task is done."
        )
        return
+
+
+# ── scratchpad streaming ─────────────────────────────────────────────────────
+
+def _send_chat(bot, chat_id, text, loop, wait: bool = False, timeout: float = 5.0):
+ """Schedule a bot.send_message on the asyncio loop from a worker thread.
+ Silently ignores failures so a transient send error never kills the
+ monitor thread.
+
+ When wait=True, block the calling thread until the send actually
+ completes (or `timeout` seconds elapse). Used for terminal messages
+ like "✅ Done." that must land in the chat BEFORE the next message
+ is scheduled — without it, the "Done" send and the "Running queued
+ task" send race inside the asyncio loop as two parallel HTTP POSTs
+ and Telegram can deliver them out of order."""
+ try:
+ fut = asyncio.run_coroutine_threadsafe(
+ bot.send_message(chat_id=chat_id, text=text), loop
+ )
+ if wait:
+ try:
+ fut.result(timeout=timeout)
+ except Exception:
+ logger.warning(
+ "send_message to chat %s did not confirm within %ss",
+ chat_id, timeout, exc_info=True,
+ )
+ except Exception:
+ logger.warning("Failed to schedule send_message to chat %s", chat_id)
+
+
def _monitor_scratchpad(chat_id, bot, loop, stop_event, start_pos):
    """Tail SCRATCHPAD_PATH and forward each new non-empty line to the chat.

    The file is polled every SCRATCHPAD_POLL_SEC seconds until *stop_event*
    is set, then read once more so a line written just before the stop is
    still delivered. Only content after *start_pos* is forwarded.
    """
    offset = start_pos

    def _drain():
        """Read everything past ``offset`` and send it line by line."""
        nonlocal offset
        if not SCRATCHPAD_PATH.exists():
            # File is gone; the next file that appears is read from its top.
            offset = 0
            return
        try:
            # A file smaller than our offset was truncated or replaced —
            # restart from the beginning rather than seek past its end.
            try:
                if SCRATCHPAD_PATH.stat().st_size < offset:
                    offset = 0
            except Exception:
                pass
            with open(SCRATCHPAD_PATH, "r", encoding="utf-8", errors="replace") as fh:
                fh.seek(offset)
                fresh = fh.read()
                if not fresh:
                    return
                offset = fh.tell()
        except Exception as exc:
            logger.warning("Scratchpad read error: %s", exc)
            return

        stripped_lines = (raw.strip() for raw in fresh.splitlines())
        for text in filter(None, stripped_lines):
            # Split over-long lines so each message fits MAX_TG_MSG_LEN.
            for begin in range(0, len(text), MAX_TG_MSG_LEN):
                end = begin + MAX_TG_MSG_LEN
                _send_chat(bot, chat_id, text[begin:end], loop)

    while not stop_event.is_set():
        _drain()
        stop_event.wait(SCRATCHPAD_POLL_SEC)

    # Final sweep for anything written between the last poll and the stop.
    _drain()
+
+
+# ── agent runner (worker thread) ─────────────────────────────────────────────
+
def _run_agent(task, provider, model, chat_id, bot, loop):
    """Run one agent task on this worker thread and report back to the chat.

    While the agent works, a monitor thread streams scratchpad lines to the
    chat. A compact banner is shown and the first pywebview window is
    minimised; both are best-effort. When the task ends (success or error)
    the chat's phase goes back to ``ready`` if it is still ``running``, and
    one queued task, if any, is started.
    """
    # Compact "task in progress" indicator. The import and constructor are
    # inside the try as well: if either fails, the thread must still reach
    # the try/finally below, otherwise the phase stays "running" forever.
    task_banner = None
    try:
        from Auto_Use.macOS_use.remote_connection.telegram.banner import StatusBanner
        task_banner = StatusBanner(compact=True)
        task_banner.show()
    except Exception:
        logger.warning("could not show task banner", exc_info=True)

    # Minimise the AutoUse window via pywebview's global `windows` list.
    # Importing from app.py would load a second copy of the module (app.py
    # runs as __main__) whose window reference is still None.
    try:
        import webview as _webview
        if _webview.windows:
            _webview.windows[0].minimize()
    except Exception:
        logger.warning("could not minimise AutoUse window", exc_info=True)

    # Start from an empty scratchpad. AgentService.__init__ wipes the
    # scratchpad directory; had we recorded the old file size, the monitor's
    # offset would land mid-way into the fresh file and stream a fragment.
    # Best-effort: a failure here only risks that fragment.
    try:
        if SCRATCHPAD_PATH.exists():
            SCRATCHPAD_PATH.unlink()
    except Exception:
        logger.warning("could not reset milestone scratchpad", exc_info=True)
    start_pos = 0
    stop_event = threading.Event()
    monitor = threading.Thread(
        target=_monitor_scratchpad,
        args=(chat_id, bot, loop, stop_event, start_pos),
        daemon=True,
        name=f"telegram-scratchpad-{chat_id}",
    )
    monitor.start()

    try:
        # Imported lazily — it pulls in heavy dependencies we don't want to
        # load until a task actually runs.
        from Auto_Use.macOS_use.agent.service import AgentService

        # Pass the provider's key from the key file explicitly, so the agent
        # does not depend on an environment variable the user never set.
        provider_key_name = PROVIDER_KEY_MAP.get(provider)
        provider_keys = _read_all_keys(_API_KEY_FILE)
        provider_api_key = (
            provider_keys.get(provider_key_name) if provider_key_name else None
        )

        agent = AgentService(
            provider=provider,
            model=model,
            save_conversation=False,
            thinking=True,
            api_key=provider_api_key,
        )
        agent.process_request(task)
        # Stop the monitor BEFORE the done message so its final sweep lands
        # first and the chat stays in chronological order.
        stop_event.set()
        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
        # wait=True: make sure "Done" is sent before the finally block may
        # schedule the "running queued task" message.
        _send_chat(bot, chat_id, "✅ Done.", loop, wait=True)
    except Exception as e:
        logger.exception("agent error")
        stop_event.set()
        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
        _send_chat(bot, chat_id, f"❌ Error: {e}", loop, wait=True)
    finally:
        if not stop_event.is_set():
            stop_event.set()
        if task_banner is not None:
            try:
                task_banner.close()
            except Exception:
                logger.warning("could not close task banner", exc_info=True)
        with _state_lock:
            state = _chat_state.get(chat_id)
            if state is not None and state.get("phase") == "running":
                state["phase"] = "ready"
        # Drain one queued task if any.
        _maybe_run_next_queued(chat_id, bot, loop)
+
+
+# ── entry points ─────────────────────────────────────────────────────────────
+
def _build_telegram_app(token: str):
    """Create the python-telegram-bot Application with all handlers attached.

    ``_post_init`` is registered as the builder's ``post_init`` hook, and the
    /start, /reset, callback-query and plain-text handlers are added in that
    order.
    """
    builder = Application.builder().token(token).post_init(_post_init)
    tg_app = builder.build()

    handlers = (
        CommandHandler("start", start_cmd),
        CommandHandler("reset", reset_cmd),
        CallbackQueryHandler(callback_handler),
        # Plain text only — commands are handled by the CommandHandlers.
        MessageHandler(filters.TEXT & ~filters.COMMAND, text_handler),
    )
    for handler in handlers:
        tg_app.add_handler(handler)
    return tg_app
+
+
# The daemon thread running the bot's polling loop; None until start_bot()
# has launched it. start_bot() checks it to avoid starting a second bot.
_BOT_THREAD: threading.Thread | None = None
+
+
+def _stderr(msg: str) -> None:
+ """Loud print to the terminal where python app.py is running — bypasses
+ whatever logging config is in effect so the user actually sees it."""
+ import sys
+ print(f"[telegram] {msg}", file=sys.stderr, flush=True)
+
+
async def _run_bot_until_stopped(tg_app):
    """Drive the bot's lifecycle by hand instead of ``run_polling()``.

    Sequence: ``initialize`` → ``start`` → ``_post_init`` → ``start_polling``,
    then wait forever. ``_post_init`` runs before polling starts so its own
    ``get_updates`` call does not overlap with the updater's polling loop.
    A failure inside ``_post_init`` is logged and does not stop the bot.
    """
    await tg_app.initialize()
    await tg_app.start()

    # The startup announcement is invoked explicitly on this manual path.
    try:
        await _post_init(tg_app)
    except Exception:
        logger.exception("post_init failed")

    updater = tg_app.updater
    await updater.start_polling(allowed_updates=Update.ALL_TYPES)
    _stderr("polling loop is live — send your bot a message")

    # Nothing ever sets this event: the coroutine parks here until the
    # hosting (daemon) thread dies with the process.
    never_set = asyncio.Event()
    await never_set.wait()
+
+
def start_bot() -> None:
    """Launch the Telegram bot's polling loop on a daemon thread.

    Does nothing (apart from a stderr note) when a bot thread is already
    alive or when no bot token can be resolved. Progress and crashes are
    reported on stderr.

    NOTE(review): the last six characters of the token are printed to
    stderr — confirm that is acceptable for a secret.
    """
    global _BOT_THREAD

    existing = _BOT_THREAD
    if existing is not None and existing.is_alive():
        _stderr("start_bot() called but the bot is already running — skipping.")
        return

    token = _resolve_token()
    if not token:
        _stderr(
            "BOT NOT STARTED — TELEGRAM_BOT_TOKEN not found in env, .env, or "
            "api_key.txt. Paste your @BotFather token into one of those files."
        )
        return
    _stderr(f"starting bot (token ends in …{token[-6:]})")

    def _runner():
        """Thread body: own event loop, build the app, run until it dies."""
        import sys
        import traceback

        try:
            # A worker thread has no event loop of its own; create one and
            # install it before any asyncio primitive is used.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            tg_app = _build_telegram_app(token)
            try:
                loop.run_until_complete(_run_bot_until_stopped(tg_app))
            finally:
                loop.close()
        except Exception as e:
            _stderr(f"BOT CRASHED: {e!r}")
            traceback.print_exc(file=sys.stderr)

    worker = threading.Thread(target=_runner, daemon=True, name="telegram-bot")
    _BOT_THREAD = worker
    worker.start()
+
+
def main():
    """Run the bot in the foreground — for testing without the full app.

    Raises:
        SystemExit: when no bot token can be resolved.
    """
    token = _resolve_token()
    if token:
        tg_app = _build_telegram_app(token)
        logger.info("Telegram bot polling started (main thread)")
        tg_app.run_polling(allowed_updates=Update.ALL_TYPES)
        return

    raise SystemExit(
        f"TELEGRAM_BOT_TOKEN not found in {_API_KEY_FILE}\n"
        "(create the bot via @BotFather first, then add the token to that file)."
    )
+
+
# Script entry point: configure INFO-level logging with timestamps, then run
# the bot in the foreground via main().
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    main()
diff --git a/Auto_Use/macOS_use/remote_connection/telegram/setup.py b/Auto_Use/macOS_use/remote_connection/telegram/setup.py
new file mode 100644
index 0000000..7d0a395
--- /dev/null
+++ b/Auto_Use/macOS_use/remote_connection/telegram/setup.py
@@ -0,0 +1,154 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Telegram remote-connection setup driver (macOS, guided mode).
+
+Opens Safari, navigates to web.telegram.org, then lets the user log in
+manually. Progress is paced by a small always-on-top banner that streams
+status text and has a Next button. The script blocks on user clicks via
+banner.wait_for_next() — the user does the actual login (phone, country,
+OTP) themselves; we just get them to the right page.
+"""
+import logging
+import os
+import time
+
+from Auto_Use.macOS_use.controller.tool.open_app import open_app
+from Auto_Use.macOS_use.tree.element import UIElementScanner, ELEMENT_CONFIG
+from Auto_Use.macOS_use.controller.service import ControllerService
+from Auto_Use.macOS_use.controller.key_combo.service import KeyComboService
+from Auto_Use.macOS_use.remote_connection.telegram.banner import StatusBanner
+from Auto_Use.macOS_use.remote_connection.telegram.service import (
+ _API_KEY_FILE, _set_key_in_file,
+)
+
logger = logging.getLogger(__name__)

# Address typed into Safari's address bar by _open_telegram_in_safari().
TELEGRAM_WEB_URL = "web.telegram.org"
# Pause, in seconds, inserted between UI-automation steps.
STEP_DELAY_SEC = 2
+
+
+def _find_address_bar(mapping: dict) -> str | None:
+ """Return the index of Safari's smart-search field, or None if not found."""
+ for idx, info in mapping.items():
+ if info.get("name") == "smart search field" and info.get("type") == "TextField":
+ return idx
+ return None
+
+
def _open_telegram_in_safari(banner) -> bool:
    """Open Safari and type the Telegram Web address into its address bar.

    Progress is reported through ``banner.update``. Returns False when Safari
    cannot be launched or its address bar is not found in the UI scan, True
    once Return has been sent.
    """
    banner.update("Please wait — confirming Safari is open…")
    launched = open_app("Safari")
    if not launched:
        logger.error("setup.py: failed to launch Safari")
        return False
    # Give the freshly opened window a moment before scanning for its
    # address bar.
    time.sleep(1)

    ui_scanner = UIElementScanner(ELEMENT_CONFIG)
    ui_scanner.scan_elements()
    elements = ui_scanner.get_elements_mapping()
    time.sleep(STEP_DELAY_SEC)

    bar_index = _find_address_bar(elements)
    if bar_index is None:
        logger.error("setup.py: Safari address bar not found in scan")
        return False

    banner.update("Safari detected. Writing the URL for you, please wait…")

    driver = ControllerService()
    driver.set_elements(elements, ui_scanner.application_name)
    keys = KeyComboService()

    # Focus the address bar, then type the URL, pausing after each step.
    ui_steps = (
        lambda: driver.click(bar_index),
        lambda: driver.canvas_input(TELEGRAM_WEB_URL),
    )
    for step in ui_steps:
        step()
        time.sleep(STEP_DELAY_SEC)

    keys.send("return")
    return True
+
+
def run(country_code: str = "", phone: str = "") -> bool:
    """Guided Telegram pairing, paced by the floating banner.

    Walks the user through opening Telegram Web, logging in, finding
    @BotFather and obtaining a bot token, then saves the token to the key
    file and hard-exits the process so the next launch starts the bot.

    Args:
        country_code: accepted but ignored — kept so existing callers of the
            older signature keep working.
        phone: accepted but ignored, same reason.

    Returns:
        False when Telegram could not be opened or no usable token was
        entered. On success the function does not return: it calls
        ``os._exit(0)``.
    """
    banner = StatusBanner()
    banner.show()
    try:
        banner.update("Let's get you set up with Telegram. Please click Next.")
        banner.wait_for_next()

        if not _open_telegram_in_safari(banner):
            banner.update("Failed to open Telegram. Close this banner and try again.")
            banner.wait_for_next(timeout=15)
            return False

        banner.update("Please log in to Telegram, then click Next")
        banner.wait_for_next()

        banner.update(
            "Now search for @BotFather in Telegram and open the chat. "
            "Click Next when you're there."
        )
        banner.wait_for_next()

        banner.update("How do you want to set up the bot?")
        choice = banner.wait_for_choice("Fresh setup", "Token already generated")

        if choice == "left":
            banner.update(
                "In @BotFather, send these one at a time: /newbot → AutoUse → "
                "a unique bot name. BotFather will reply with your token. "
                "Click Next when you have it."
            )
            banner.wait_for_next()

        banner.update("Paste your BotFather token below and click Save.")
        # Strip BEFORE the emptiness check. A whitespace-only entry used to
        # pass the check and was then saved as an empty token, after which
        # the process exited as if setup had succeeded.
        token = (banner.wait_for_input(save_label="Save") or "").strip()
        if not token:
            return False  # nothing usable entered (or the banner never appeared)

        _set_key_in_file(_API_KEY_FILE, "TELEGRAM_BOT_TOKEN", token)

        banner.update("Saved. Restarting AutoUse to start the bot…")
        # Leave the message on screen for a moment, then hard-exit the whole
        # process. os._exit skips atexit handlers and the finally block
        # below, which is why the banner is closed explicitly first.
        time.sleep(3)
        banner.close()
        os._exit(0)
    finally:
        banner.close()
diff --git a/Auto_Use/macOS_use/remote_connection/telegram/view.py b/Auto_Use/macOS_use/remote_connection/telegram/view.py
new file mode 100644
index 0000000..5eb5758
--- /dev/null
+++ b/Auto_Use/macOS_use/remote_connection/telegram/view.py
@@ -0,0 +1,157 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Flask Blueprint for the macOS Telegram surface.
+
+Lives in the telegram folder so all Telegram-related code stays here — app.py
+just imports `telegram_bp` and calls `app.register_blueprint(...)`. Routes:
+
+ GET /api/telegram/status → {connected, bot_username?}
+ POST /api/telegram/connect → kicks off the Phase 4 guided walkthrough
+ POST /api/telegram/disconnect → clears the persisted token
+
+All token lookups read ONLY from api_key.txt. We deliberately do NOT consult
+.env — that file is app.py's general env-loading concern; the Telegram bot
+treats api_key.txt as its single source of truth.
+"""
+import json
+import logging
+import threading
+import urllib.request
+
+from flask import Blueprint, jsonify
+
logger = logging.getLogger(__name__)

# Blueprint carrying the /api/telegram/* routes defined below.
telegram_bp = Blueprint("telegram_macos", __name__)

# Single source of truth for the key-file path — service.py resolves it in a
# compiled-build-aware way (next to the executable when frozen). Importing it
# here keeps the picker/status/disconnect routes pointed at the same file the
# bot and the regular agent read.
from .service import _API_KEY_FILE

# Cached bot username for /api/telegram/status. None means "not looked up
# yet"; an empty string means the lookup ran but returned no username.
_bot_username_cache: str | None = None
+
+
def _read_token() -> str | None:
    """Return TELEGRAM_BOT_TOKEN from the key file, or None.

    None is returned when the file does not exist, cannot be read, has no
    ``TELEGRAM_BOT_TOKEN=`` line, or the first such line has an empty value.
    Only the key file is consulted — not .env or environment variables.
    """
    if not _API_KEY_FILE.exists():
        return None

    prefix = "TELEGRAM_BOT_TOKEN="
    try:
        with open(_API_KEY_FILE, "r", encoding="utf-8") as fh:
            for raw in fh:
                entry = raw.strip()
                if not entry.startswith(prefix):
                    continue
                # First matching line wins, even when its value is empty.
                return entry[len(prefix):].strip() or None
    except Exception:
        logger.warning("could not read %s", _API_KEY_FILE)
    return None
+
+
def _set_token(value: str) -> None:
    """Set ``TELEGRAM_BOT_TOKEN=<value>`` in the key file, keeping other lines.

    Every existing ``TELEGRAM_BOT_TOKEN=`` line is replaced; when there is
    none, one is appended. An empty *value* clears the token. If the existing
    file cannot be read, nothing is written. Read and write failures are
    logged, never raised.
    """
    entry = f"TELEGRAM_BOT_TOKEN={value}\n"
    output = []
    replaced = False

    if _API_KEY_FILE.exists():
        try:
            with open(_API_KEY_FILE, "r", encoding="utf-8") as src:
                for raw in src:
                    if raw.strip().startswith("TELEGRAM_BOT_TOKEN="):
                        output.append(entry)
                        replaced = True
                        continue
                    # Make sure a final line without a newline gets one, so
                    # an appended entry starts on its own line.
                    output.append(raw if raw.endswith("\n") else raw + "\n")
        except Exception:
            logger.warning("could not read %s while updating token", _API_KEY_FILE)
            return

    if not replaced:
        output.append(entry)

    try:
        _API_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(_API_KEY_FILE, "w", encoding="utf-8") as dst:
            dst.writelines(output)
    except Exception:
        logger.warning("could not write %s", _API_KEY_FILE)
+
+
def _fetch_bot_username(token: str) -> str | None:
    """Ask Telegram's ``getMe`` for the bot's username.

    Returns the username, or None when the request fails, the reply is not
    ``ok``, or it carries no username. Never raises.
    """
    try:
        # The context manager closes the HTTP response, which previously was
        # left open.
        with urllib.request.urlopen(
            f"https://api.telegram.org/bot{token}/getMe", timeout=5
        ) as resp:
            data = json.loads(resp.read())
        if data.get("ok"):
            return data["result"].get("username", "") or None
    except Exception as exc:
        # Log the exception type only: the request URL embeds the token, so
        # avoid echoing anything that might contain it.
        logger.debug("getMe lookup failed: %s", type(exc).__name__)
    return None
+
+
+# ── routes ──────────────────────────────────────────────────────────────────
+
@telegram_bp.route("/api/telegram/status", methods=["GET"])
def telegram_status():
    """Report whether a bot token is configured, plus the bot's username.

    Responds ``{"connected": false}`` when the key file holds no token, and
    ``{"connected": true, "bot_username": ...}`` otherwise. The username is
    looked up once and cached; a failed lookup is cached as an empty string,
    so it is not retried until the cache is cleared.
    """
    global _bot_username_cache

    token = _read_token()
    if token:
        if _bot_username_cache is None:
            _bot_username_cache = _fetch_bot_username(token) or ""
        payload = {
            "connected": True,
            "bot_username": _bot_username_cache,
        }
    else:
        # No token: drop any stale username.
        _bot_username_cache = None
        payload = {"connected": False}
    return jsonify(payload)
+
+
@telegram_bp.route("/api/telegram/connect", methods=["POST"])
def telegram_connect():
    """Start the guided Telegram setup walkthrough in the background.

    The walkthrough blocks on user clicks, so it runs on a daemon thread and
    this route answers ``{"status": "started"}`` right away. Any exception
    while starting it yields a 500 with the error message.
    """
    try:
        # Imported here rather than at module import time.
        from Auto_Use.macOS_use.remote_connection.telegram.setup import (
            run as run_telegram_setup,
        )

        worker = threading.Thread(target=run_telegram_setup, daemon=True)
        worker.start()
        response = jsonify({"status": "started"})
        return response
    except Exception as e:
        logger.exception("telegram_connect failed")
        failure = {"status": "error", "message": str(e)}
        return jsonify(failure), 500
+
+
@telegram_bp.route("/api/telegram/disconnect", methods=["POST"])
def telegram_disconnect():
    """Clear the stored bot token and the cached bot username.

    This is a soft disconnect: nothing here stops a polling thread that is
    already running.
    """
    global _bot_username_cache

    _set_token("")
    _bot_username_cache = None
    payload = {"status": "disconnected"}
    return jsonify(payload)
diff --git a/Auto_Use/macOS_use/tree/element.py b/Auto_Use/macOS_use/tree/element.py
index 09c54c6..8af9a88 100644
--- a/Auto_Use/macOS_use/tree/element.py
+++ b/Auto_Use/macOS_use/tree/element.py
@@ -58,6 +58,7 @@
from ApplicationServices import (
AXUIElementCreateSystemWide, AXUIElementCreateApplication,
AXUIElementCopyAttributeValue, AXUIElementSetAttributeValue,
+ AXUIElementGetPid,
AXIsProcessTrusted, kAXErrorSuccess,
)
@@ -530,21 +531,47 @@ def _point_in_rect(px, py, rect):
and rect["y"] <= py <= rect["y"] + rect["height"])
-def _ancestor_clipped_visibility(frame, ancestors, screen, window_clip=None):
+def _ancestor_clipped_visibility(frame, ancestors, screen, window_clip=None,
+ scroll_clip=None):
"""Bottom-up visibility check — mirrors Windows _get_clipping_ancestors.
- Returns (visibility_str, visible_rect_dict_or_None)."""
+ Returns (visibility_str, visible_rect_dict_or_None).
+
+ `ancestors` may be a list of frame dicts (legacy callers) or
+ `(frame, role)` tuples; tuple form lets us recognise scroll containers
+ and skip the fixed/sticky safety-net for them.
+
+ `scroll_clip`, when provided, is the innermost scrollable container's
+ viewport rect. Elements outside it are scroll-clipped — strictly hidden,
+ no safety-net.
+ """
visible = dict(frame)
+ if scroll_clip is not None:
+ inter = _rect_intersect(visible, scroll_clip)
+ if inter is None:
+ return "hidden", None
+ visible = inter
+
for anc in ancestors:
if anc is None:
continue
- if anc["width"] < 50 or anc["height"] < 50:
+ if isinstance(anc, tuple):
+ anc_frame, anc_role = anc
+ if anc_frame is None:
+ continue
+ else:
+ anc_frame, anc_role = anc, None
+ if anc_frame["width"] < 50 or anc_frame["height"] < 50:
continue
- inter = _rect_intersect(visible, anc)
+ inter = _rect_intersect(visible, anc_frame)
if inter is None:
- anc_on_screen = _rect_intersect(anc, screen) is not None
- anc_large = anc["width"] >= 100 and anc["height"] >= 100
+ # Scroll containers are authoritative — if the element's frame is
+ # outside the viewport, it really is scrolled out. Don't bypass.
+ if anc_role in CLIP_ROLES:
+ return "hidden", None
+ anc_on_screen = _rect_intersect(anc_frame, screen) is not None
+ anc_large = anc_frame["width"] >= 100 and anc_frame["height"] >= 100
if anc_on_screen and anc_large:
# Safety net for CSS position:fixed / sticky elements —
# their AX parent frames may not encompass them even though
@@ -614,9 +641,17 @@ def walk(element, results, depth, screen, clip=None, parent_frame=None,
if frame and frame["width"] > 0 and frame["height"] > 0:
label = build_label(element, cfg)
if label:
- vis_str, vis_rect = _ancestor_clipped_visibility(frame, ancestors, screen, window_clip)
+ vis_str, vis_rect = _ancestor_clipped_visibility(
+ frame, ancestors, screen, window_clip,
+ scroll_clip=clip)
if vis_str != "hidden":
+ try:
+ err, elem_pid = AXUIElementGetPid(element, None)
+ if err != kAXErrorSuccess:
+ elem_pid = 0
+ except Exception:
+ elem_pid = 0
results.append({
"type": role_str,
"label": label,
@@ -628,9 +663,12 @@ def walk(element, results, depth, screen, clip=None, parent_frame=None,
"visibility": vis_str,
"visible_rect_raw": vis_rect,
"ax_element": element,
+ "_window_frame": window_clip,
+ "_pid": elem_pid,
})
- child_ancestors = ancestors + [my_frame] if my_frame else ancestors
+ my_entry = (my_frame, role_str) if my_frame else None
+ child_ancestors = ancestors + [my_entry] if my_entry else ancestors
children = ax_attr(element, "AXChildren")
if children:
try:
@@ -698,17 +736,145 @@ def _find_topmost_app_on_screen(screen):
return topmost, window_stack
-def _is_occluded(element, allowed_pids, window_stack):
- """Check if element is behind another app's window."""
- cx = element["x"] + element["width"] / 2
- cy = element["y"] + element["height"] / 2
- for win in window_stack:
- if _point_in_rect(cx, cy, win["frame"]):
- if win["pid"] in allowed_pids:
- return False
- else:
- return True
- return False
+def _build_full_occluder_stack(screen):
+    """List every on-screen window (all layers) that could cover UI.
+
+    Asks CGWindowListCopyWindowInfo for on-screen windows with desktop
+    elements excluded and keeps the order it returns, which callers treat
+    as front-to-back. A window is left out when it is owned by
+    "Window Server" or "Dock", has no bounds, is narrower or shorter than
+    50 px, or does not overlap `screen`.
+
+    Returns:
+        A list of dicts with keys ``pid``, ``name``, ``frame`` (an
+        ``{x, y, width, height}`` dict), ``window_id`` and ``layer``;
+        empty when the window list is unavailable.
+    """
+    options = (kCGWindowListOptionOnScreenOnly
+               | kCGWindowListExcludeDesktopElements)
+    window_infos = CGWindowListCopyWindowInfo(options, kCGNullWindowID)
+    if not window_infos:
+        return []
+
+    ignored_owners = ("Window Server", "Dock")
+    occluders = []
+    for info in window_infos:
+        owner_name = info.get("kCGWindowOwnerName", "")
+        rect = info.get("kCGWindowBounds")
+        if owner_name in ignored_owners or not rect:
+            continue
+
+        width = rect.get("Width", 0)
+        height = rect.get("Height", 0)
+        if width < 50 or height < 50:
+            continue  # too small to matter as an occluder
+
+        left = rect.get("X", 0)
+        top = rect.get("Y", 0)
+        if _rect_overlaps(left, top, width, height, screen):
+            occluders.append({
+                "pid": info.get("kCGWindowOwnerPID", 0),
+                "name": owner_name,
+                "frame": {"x": left, "y": top,
+                          "width": width, "height": height},
+                "window_id": info.get("kCGWindowNumber", 0),
+                "layer": info.get("kCGWindowLayer", 0),
+            })
+    return occluders
+
+
+def _apply_window_occlusion(results, screen):
+    """Re-evaluate element visibility against the real window z-order.
+
+    For each element whose owning window can be located in the on-screen
+    window stack, every window listed before it is treated as a potential
+    occluder. The uncovered fraction of the element's rect is multiplied by
+    the fraction already left visible by walk-time clipping
+    (``visible_rect_raw``), and the element's ``visibility`` string is
+    rewritten from the product. Elements whose combined visibility falls
+    below 1% are dropped.
+
+    Elements with no identifiable owning window (no ``_pid``, no
+    ``_window_frame``, or no matching stack entry, e.g. menu-bar or dock
+    walk results) are passed through unchanged, as are elements that no
+    window in front intersects.
+
+    Returns:
+        `results` itself when the window stack is empty; otherwise a new
+        list holding the surviving element dicts (mutated in place).
+    """
+    window_stack = _build_full_occluder_stack(screen)
+    if not window_stack:
+        return results
+
+    rect_keys = ("x", "y", "width", "height")
+    # (pid, x, y, width, height) -> position of the owning window in the stack
+    position_cache = {}
+
+    def _stack_position(pid, win_frame):
+        """Return the stack position of the element's window, or -1."""
+        if not pid or win_frame is None:
+            return -1
+        cache_key = (pid,) + tuple(win_frame[k] for k in rect_keys)
+        if cache_key not in position_cache:
+            # AX and CG frames can disagree slightly, hence the 20 px slack.
+            position_cache[cache_key] = next(
+                (pos for pos, win in enumerate(window_stack)
+                 if win["pid"] == pid
+                 and all(abs(win["frame"][k] - win_frame[k]) < 20
+                         for k in rect_keys)),
+                -1,
+            )
+        return position_cache[cache_key]
+
+    kept = []
+    for elem in results:
+        owner_pos = _stack_position(elem.get("_pid"),
+                                    elem.get("_window_frame"))
+        if owner_pos < 0:
+            # Unknown owning window (menu-bar walk, dock): leave as-is.
+            kept.append(elem)
+            continue
+
+        elem_rect = {k: elem[k] for k in rect_keys}
+        owner_id = window_stack[owner_pos].get("window_id")
+        covering = [
+            win["frame"] for win in window_stack[:owner_pos]
+            if not (win["window_id"] and win["window_id"] == owner_id)
+            and _rect_intersect(elem_rect, win["frame"]) is not None
+        ]
+        if not covering:
+            kept.append(elem)
+            continue
+
+        uncovered = _visible_fraction_after_occluders(elem_rect, covering)
+        # Fold in the fraction that survived walk-time ancestor clipping.
+        clipped = elem.get("visible_rect_raw")
+        if clipped:
+            full_area = max(1, elem_rect["width"] * elem_rect["height"])
+            walk_fraction = (clipped["width"] * clipped["height"]) / full_area
+        else:
+            walk_fraction = 1.0
+        combined = walk_fraction * uncovered
+
+        if combined < 0.01:
+            continue  # effectively invisible: drop it
+        elem["visibility"] = (
+            "full" if combined >= 0.99
+            else f"partial {int(combined * 100)}%"
+        )
+        kept.append(elem)
+
+    return kept
+
+
+def _visible_fraction_after_occluders(rect, occluder_rects, samples=20):
+    """Estimate the uncovered fraction of `rect` by sampling a regular grid.
+
+    Args:
+        rect: ``{x, y, width, height}`` dict of the element.
+        occluder_rects: rect dicts of the same shape that paint on top of
+            `rect`. A sample point counts as covered when it lies inside
+            (edges included) ANY of them.
+        samples: grid resolution per axis; ``samples x samples`` cell
+            centres are tested (400 points by default). Values below 1 are
+            clamped to 1 so the function never divides by zero.
+
+    Returns:
+        A float in 0.0..1.0: 0.0 for a degenerate rect, 1.0 when there are
+        no occluders, otherwise the share of sample points left uncovered.
+    """
+    if rect["width"] <= 0 or rect["height"] <= 0:
+        return 0.0
+    if not occluder_rects:
+        return 1.0
+
+    # samples=0 used to raise ZeroDivisionError and a negative value
+    # silently reported "fully visible"; always test at least one point.
+    samples = max(1, int(samples))
+    step_x = rect["width"] / samples
+    step_y = rect["height"] / samples
+    xs = [rect["x"] + (i + 0.5) * step_x for i in range(samples)]
+    ys = [rect["y"] + (j + 0.5) * step_y for j in range(samples)]
+
+    # Occluder edges are loop-invariant: compute them once, not per point.
+    edges = [
+        (occ["x"], occ["x"] + occ["width"],
+         occ["y"], occ["y"] + occ["height"])
+        for occ in occluder_rects
+    ]
+
+    covered = sum(
+        1
+        for px in xs
+        for py in ys
+        if any(left <= px <= right and top <= py <= bottom
+               for left, right, top, bottom in edges)
+    )
+    total = samples * samples
+    return (total - covered) / total
def _scan_menu_bar(screen, top_pid):
@@ -919,7 +1085,11 @@ def extract_all(screen):
walk(win, results, 0, screen, clip=screen_clip, is_browser=is_browser, window_clip=screen_clip)
if window_stack:
- # Find overlay/dialog windows from any process that overlap the topmost app
+ # Find overlay/dialog windows from other processes that actually
+ # float ABOVE the topmost app (Spotlight, system popovers, sheets).
+ # Walk full window list front-to-back; stop when we reach the
+ # frontmost app's first layer-0 window — anything after is behind
+ # it and must be excluded.
dialog_pids = set()
skip_dialog_owners = {"Window Server", "Dock"}
top_frame = top["frame"]
@@ -928,8 +1098,11 @@ def extract_all(screen):
if all_wins:
for w in all_wins:
wpid = w.get("kCGWindowOwnerPID", 0)
+ layer = w.get("kCGWindowLayer", -1)
+ if wpid == top["pid"] and layer == 0:
+ break # reached frontmost app's window; stop
if wpid == top["pid"]:
- continue # Skip topmost app's own windows
+ continue # frontmost app's own higher-layer windows already walked
owner = w.get("kCGWindowOwnerName", "")
if owner in skip_dialog_owners:
continue
@@ -955,10 +1128,6 @@ def extract_all(screen):
if dwf and _on_screen(dwf, screen):
walk(dwin, results, 0, screen, clip=dwf, window_clip=dwf)
- allowed_pids = {top["pid"]} | dialog_pids
- results = [e for e in results
- if not _is_occluded(e, allowed_pids, window_stack)]
-
else:
finder = find_app("com.apple.finder")
if finder:
@@ -1053,6 +1222,12 @@ def extract_all(screen):
if dock:
walk(AXUIElementCreateApplication(dock.processIdentifier()), results, 0, screen)
+ # ----- Real on-screen occlusion pass -----
+ # Recompute each element's visibility against every window that paints
+ # on top of its owning window. Drop fully-covered elements so the agent
+ # never receives a click index for a coordinate it can't actually hit.
+ results = _apply_window_occlusion(results, screen)
+
# Deduplicate
seen = set()
unique = []
@@ -1063,6 +1238,11 @@ def extract_all(screen):
unique.append(e)
results = unique
+ # Strip internal helper keys before returning so they don't leak.
+ for e in results:
+ e.pop("_window_frame", None)
+ e.pop("_pid", None)
+
return app_info, menu_items, results
diff --git a/Auto_Use/windows_use/remote_connection/telegram/__init__.py b/Auto_Use/windows_use/remote_connection/telegram/__init__.py
new file mode 100644
index 0000000..556670d
--- /dev/null
+++ b/Auto_Use/windows_use/remote_connection/telegram/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
diff --git a/Auto_Use/windows_use/remote_connection/telegram/banner.py b/Auto_Use/windows_use/remote_connection/telegram/banner.py
new file mode 100644
index 0000000..37041a3
--- /dev/null
+++ b/Auto_Use/windows_use/remote_connection/telegram/banner.py
@@ -0,0 +1,1149 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Banner — both the StatusBanner wrapper used by callers AND the
+subprocess that hosts the pywebview pill.
+
+The same module is invoked two ways:
+
+ 1. **Imported** from setup.py / service.py — exposes the
+ `StatusBanner` class that drives the wizard. Side-effect-free:
+ pywebview is NOT imported at module load, only inside
+ `_run_subprocess_banner` which the parent never calls.
+
+ 2. **Run as `python -m …banner`** (spawned by `StatusBanner.show()`
+ via `subprocess.Popen`) — falls through `if __name__ == "__main__"`
+ into `_run_subprocess_banner`, which boots pywebview and parks on
+ `webview.start()`. Reads JSON commands from stdin, emits JSON
+ events on stdout.
+
+Why two roles, one file? Running pywebview's second window from a
+worker thread inside the already-running AutoUse process kept landing
+the pill off-screen on DPI-scaled displays. A fresh Python interpreter
+(the subprocess) was the only way to dodge that DPI confusion —
+`banner_test.py` standalone works perfectly on the same machine. The
+subprocess body used to live in a separate `banner_proc.py` but it
+doesn't need to: a single module's `__main__` guard does the same job
+with one fewer file to keep in sync.
+
+Wire protocol (one JSON message per line):
+
+ → stdin {"cmd": "MSG"|"SHOW_NEXT"|"HIDE_NEXT"|"SHOW_CHOICE"|
+ "SHOW_INPUT"|"CLEAR"|"CLOSE", ...}
+ ← stdout {"event": "READY"|"NEXT"|"CHOICE"|"SAVE"|"CLOSED", ...}
+"""
+import ctypes
+import datetime
+import json
+import logging
+import os
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+import uuid
+from queue import Queue, Empty
+
+logger = logging.getLogger(__name__)
+
+# True when this module is running inside the Nuitka-compiled AutoUse.exe
+# (i.e. sys.executable is the exe, not a Python interpreter). In that case
+# `python -m …banner` is meaningless — the binary has no -m loader — so
+# StatusBanner.show() must re-exec AutoUse.exe with --banner-mode, which
+# app.py's main() picks up and routes to _run_subprocess_banner() directly.
+# Mirrors the detection in app.py:71 and the same pattern already used for
+# --minion-mode in Auto_Use/windows_use/controller/view.py:697.
+_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals()
+
+
+# ── Pill geometry ─────────────────────────────────────────────────────────
+
+PILL_WIDTH = 580
+PILL_HEIGHT = 72
+# COMPACT_SIZE is the target square dimension for the small "telegram
+# task running" indicator pill. WinForms imposes an OS-level minimum
+# width (~SM_CXMINTRACK = 132+ logical pixels) on freshly created Forms,
+# which stretches a smaller create_window request into a wide pill — so
+# we always force the final size via window.resize() after the form is
+# alive (see _on_shown). 80 is the sweet spot: small enough to read as
+# an indicator, big enough to hold the 42 px orb with breathing room.
+COMPACT_SIZE = 80
+SCREEN_MARGIN = 20
+
+
+# ── Win32 region clip + click-through (subprocess-side, but ctypes is
+# stdlib so importing it at the top costs nothing for the parent) ──
+
+class _RECT(ctypes.Structure):
+    """ctypes layout of a Win32 RECT: four C longs in left, top, right,
+    bottom order. Passed by reference to GetWindowRect to read a window's
+    bounds."""
+
+    _fields_ = [
+        (edge, ctypes.c_long) for edge in ("left", "top", "right", "bottom")
+    ]
+
+
+def _stderr(msg: str) -> None:
+    """Print `msg` to the current stderr with a ``[banner]`` prefix.
+
+    Shared by the parent (spawn diagnostics) and the subprocess, which
+    inherits the parent's stderr so both sets of messages land in the same
+    terminal. The stream is flushed on every call.
+    """
+    line = "[banner] {}".format(msg)
+    print(line, file=sys.stderr, flush=True)
+
+
+def _emit(event: str, **kwargs) -> None:
+    """Subprocess → parent: write one JSON event line to stdout.
+
+    The line is ``{"event": <event>, ...kwargs}`` followed by a newline, and
+    stdout is flushed straight away. Best effort: any failure (payload that
+    cannot be serialised, closed pipe) is swallowed.
+    """
+    try:
+        message = dict(event=event)
+        message.update(kwargs)
+        sys.stdout.write("%s\n" % json.dumps(message))
+        sys.stdout.flush()
+    except Exception:
+        pass
+
+
+# File-based event log for the subprocess. Lives at
+# %LOCALAPPDATA%\AutoUse\banner_debug.log so it survives whatever happens
+# to the subprocess's stdio. We log subprocess start, on_shown, events.closing,
+# events.closed, exceptions, and webview.start() return — enough to point at
+# the exact proximate cause if the pill ever vanishes mid-flow again. Best
+# effort: any failure to write is swallowed.
+def _log(msg: str) -> None:
+    """Append a timestamped, pid-tagged line to the banner debug log.
+
+    The log lives at ``<LOCALAPPDATA or home>/AutoUse/banner_debug.log``;
+    the directory is created on demand. Best effort: any failure while
+    preparing or writing the entry is swallowed.
+    """
+    try:
+        root = os.environ.get("LOCALAPPDATA") or os.path.expanduser("~")
+        log_dir = os.path.join(root, "AutoUse")
+        os.makedirs(log_dir, exist_ok=True)
+        log_path = os.path.join(log_dir, "banner_debug.log")
+        with open(log_path, "a", encoding="utf-8") as handle:
+            stamp = datetime.datetime.now().isoformat()
+            handle.write(f"[{stamp}] pid={os.getpid()} {msg}\n")
+    except Exception:
+        pass
+
+
+def _js_escape(text: str) -> str:
+    """Escape `text` for embedding inside a single-quoted JS string literal.
+
+    Backslashes and single quotes are backslash-escaped, newlines become
+    the two-character sequence ``\\n`` and carriage returns are dropped.
+    A falsy `text` (``None`` or ``""``) yields an empty string.
+    """
+    replacements = str.maketrans({
+        "\\": "\\\\",
+        "'": "\\'",
+        "\n": "\\n",
+        "\r": None,
+    })
+    return (text or "").translate(replacements)
+
+
+def _find_hwnd(title: str) -> int:
+    """Return the HWND of the top-level window titled `title`, or 0.
+
+    events.shown can fire a frame before FindWindowW is able to see the new
+    window, so the lookup is retried up to 40 times with a 25 ms pause
+    after each miss.
+    """
+    find_window = ctypes.windll.user32.FindWindowW
+    attempts_left = 40
+    while attempts_left:
+        handle = find_window(None, title)
+        if handle:
+            return handle
+        time.sleep(0.025)
+        attempts_left -= 1
+    return 0
+
+
+def _make_click_through(title: str) -> None:
+    """Let mouse input fall through the window titled `title`.
+
+    Adds WS_EX_LAYERED | WS_EX_TRANSPARENT to the window's extended style
+    and then sets the layered alpha to 255, so the window stays fully
+    visible while no longer receiving clicks. Does nothing when the window
+    cannot be found.
+
+    Used for the compact "telegram task in progress" indicator, which is a
+    passive visual cue and must never block the desktop or apps beneath it
+    (the counterpart of ``setIgnoresMouseEvents_(True)`` on macOS).
+    """
+    user32 = ctypes.windll.user32
+    hwnd = _find_hwnd(title)
+    if not hwnd:
+        return
+
+    ex_style_index = -20                            # GWL_EXSTYLE
+    layered_and_transparent = 0x00080000 | 0x00000020  # WS_EX_LAYERED | WS_EX_TRANSPARENT
+    alpha_flag = 0x00000002                         # LWA_ALPHA
+
+    current = user32.GetWindowLongW(hwnd, ex_style_index)
+    user32.SetWindowLongW(
+        hwnd, ex_style_index, current | layered_and_transparent
+    )
+    # A layered window paints nothing until its attributes are set;
+    # alpha=255 keeps the orb opaque, only mouse input is dropped.
+    user32.SetLayeredWindowAttributes(hwnd, 0, 255, alpha_flag)
+
+
+def _apply_rounded_region(title: str) -> None:
+    """Clip the window titled `title` into a stadium-shaped pill.
+
+    Locates the HWND through `_find_hwnd` (which polls briefly, because
+    events.shown can fire before the window is findable), reads the real
+    window size with GetWindowRect and installs a round-rect region whose
+    corner ellipse is height × height, giving full-height end caps.
+
+    Silently does nothing when the window cannot be found, its rect cannot
+    be read, or its size is degenerate.
+    """
+    hwnd = _find_hwnd(title)
+    if not hwnd:
+        return
+
+    user32 = ctypes.windll.user32
+    gdi32 = ctypes.windll.gdi32
+
+    rect = _RECT()
+    # A failed GetWindowRect leaves rect zeroed; bail out explicitly.
+    if not user32.GetWindowRect(hwnd, ctypes.byref(rect)):
+        return
+    w = rect.right - rect.left
+    h = rect.bottom - rect.top
+    if w <= 0 or h <= 0:
+        return
+
+    # Pill: full-height end caps via corner ellipse = h × h.
+    rgn = gdi32.CreateRoundRectRgn(0, 0, w + 1, h + 1, h, h)
+    if not rgn:
+        return
+    # On success the system takes ownership of the region. On failure it
+    # is still ours, so release it instead of leaking a GDI handle.
+    if not user32.SetWindowRgn(hwnd, rgn, True):
+        gdi32.DeleteObject(rgn)
+
+
+# ── HTML (subprocess-side only — parent never touches these strings) ──
+
+BANNER_HTML = r"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Starting…
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+
+COMPACT_HTML = r"""
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+
+
+# ── JS↔Python bridge (subprocess-side only) ──────────────────────────────
+
+
+# Hard cap on pill height so a freakishly long message can't push it
+# into a wall-of-text rectangle. Matches the macOS banner's MAX_H.
+_MAX_PILL_HEIGHT = 200
+
+
+class _BannerState:
+    """Mutable state shared by the resize handler and the subprocess body.
+
+    Deliberately NOT passed to pywebview as `js_api`; the JS-facing
+    handlers are plain functions built by _make_js_handlers.
+    """
+
+    def __init__(self, title: str, width: int, min_h: int, compact: bool):
+        # Geometry and identity supplied by the caller.
+        self.title, self.width = title, width
+        self.min_h, self.compact = min_h, compact
+        # Attached after webview.create_window() returns.
+        self.window = None
+        # Last height applied to the window; starts at the minimum.
+        self.last_h = min_h
+
+
+def _make_js_handlers(state: _BannerState):
+    """Build the JS-exposed handlers as a 4-tuple of plain local functions.
+
+    The functions are meant to be registered with `window.expose(*funcs)`
+    rather than as methods of a `js_api` object: pywebview filters js_api
+    attributes with `inspect.ismethod`, which is False for bound methods of
+    Nuitka-compiled classes, so in the compiled binary every method would
+    be dropped and JS clicks would become no-ops. `window.expose()` stores
+    the functions directly, ahead of any js_api reflection.
+
+    Returns:
+        ``(next_clicked, choice_clicked, save_clicked, height_changed)``.
+    """
+
+    def next_clicked(_value=None):
+        """Report a click on the Next button."""
+        _emit("NEXT")
+        return None
+
+    def choice_clicked(value=None):
+        """Report which choice button was clicked ("left" if unspecified)."""
+        _emit("CHOICE", value=str(value) if value is not None else "left")
+        return None
+
+    def save_clicked(value=None):
+        """Report the saved input, stripped; non-strings become ""."""
+        _emit("SAVE", value=value.strip() if isinstance(value, str) else "")
+        return None
+
+    def height_changed(h=0):
+        """Resize the window to the reported body height and re-clip it.
+
+        The height is clamped to ``state.min_h .. _MAX_PILL_HEIGHT``.
+        No-op for the compact pill (constant size) or before the window
+        is attached.
+        """
+        if state.compact or state.window is None:
+            return None
+        try:
+            target = max(state.min_h, min(_MAX_PILL_HEIGHT, int(h)))
+            if target == state.last_h:
+                return None
+            state.window.resize(state.width, target)
+            # Record the height only after the resize succeeded; otherwise
+            # one failed resize would suppress every retry at this height.
+            state.last_h = target
+            # SetWindowRgn's saved region is anchored to the OLD height, so
+            # without re-clipping the taller window would show a hard
+            # rectangle below the pill ends.
+            _apply_rounded_region(state.title)
+        except Exception as exc:
+            # Still best effort, but leave a footprint in the debug log.
+            _log(f"height_changed({h!r}) failed: {exc!r}")
+        return None
+
+    return next_clicked, choice_clicked, save_clicked, height_changed
+
+
+# ── stdin reader thread (subprocess-side only) ───────────────────────────
+
+
+def _stdin_reader(window) -> None:
+    """Read JSON commands from stdin and apply them to the banner window.
+
+    Runs on its own thread so the pywebview GUI thread is never blocked.
+    Each non-empty line must be a JSON object with a "cmd" key (MSG,
+    SHOW_NEXT, HIDE_NEXT, SHOW_CHOICE, SHOW_INPUT, CLEAR or CLOSE). Lines
+    that fail to parse, or parse to something other than an object, are
+    logged and skipped. A failing command is logged and the loop carries on.
+
+    CLOSE destroys the window and ends the thread. If stdin reaches EOF or
+    the read loop itself fails, the parent can no longer send CLOSE, so the
+    window is destroyed as well rather than being left orphaned on screen.
+    """
+    import traceback
+
+    _log("stdin_reader: thread started")
+    try:
+        for line in sys.stdin:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                msg = json.loads(line)
+            except Exception:
+                _log(f"stdin_reader: skip unparseable line {line!r}")
+                continue
+            # Valid JSON that is not an object (e.g. `[]` or `3`) has no
+            # .get(); without this check it would kill the reader thread.
+            if not isinstance(msg, dict):
+                _log(f"stdin_reader: skip non-object line {line!r}")
+                continue
+            cmd = msg.get("cmd")
+            _log(f"stdin_reader: cmd={cmd!r}")
+            try:
+                if cmd == "MSG":
+                    esc = _js_escape(msg.get("text", ""))
+                    window.evaluate_js(f"if(window.setMsg) setMsg('{esc}');")
+                elif cmd == "SHOW_NEXT":
+                    window.evaluate_js("if(window.showNext) showNext();")
+                elif cmd == "HIDE_NEXT":
+                    window.evaluate_js("if(window.hideNext) hideNext();")
+                elif cmd == "SHOW_CHOICE":
+                    left = _js_escape(msg.get("left", ""))
+                    right = _js_escape(msg.get("right", ""))
+                    window.evaluate_js(
+                        f"if(window.setChoice) setChoice('{left}', '{right}');"
+                    )
+                elif cmd == "SHOW_INPUT":
+                    label = _js_escape(msg.get("label", "Save"))
+                    window.evaluate_js(
+                        f"if(window.setInput) setInput('{label}');"
+                    )
+                elif cmd == "CLEAR":
+                    window.evaluate_js("if(window.clearAll) clearAll();")
+                elif cmd == "CLOSE":
+                    _log("stdin_reader: CLOSE received, destroying window")
+                    try:
+                        window.destroy()
+                    except Exception:
+                        _log(
+                            "stdin_reader: window.destroy() raised:\n"
+                            + traceback.format_exc()
+                        )
+                    return
+            except Exception:
+                # Window may have been destroyed mid-flight: log and keep
+                # the reader alive so the process exits cleanly.
+                _log(
+                    f"stdin_reader: cmd={cmd!r} dispatch raised:\n"
+                    + traceback.format_exc()
+                )
+    except Exception:
+        _log("stdin_reader: outer loop raised:\n" + traceback.format_exc())
+    _log("stdin_reader: thread exiting (stdin EOF or pipe break)")
+    # Nothing can reach this window any more. Tear it down so the
+    # always-on-top pill does not outlive its parent.
+    try:
+        window.destroy()
+    except Exception:
+        _log(
+            "stdin_reader: window.destroy() after EOF raised:\n"
+            + traceback.format_exc()
+        )
+
+
+# ── subprocess entry point ────────────────────────────────────────────────
+
+
+def _run_subprocess_banner() -> None:
+    """Subprocess body: create the pywebview pill and run its GUI loop.
+
+    pywebview is imported lazily so that a parent which only uses
+    StatusBanner never pays its start-up cost. The window is placed at the
+    top-right of the primary screen; ``--compact`` on the command line
+    selects the small click-through indicator instead of the wizard pill.
+    Once the window is shown, READY is emitted and the stdin reader thread
+    starts. After the GUI loop returns, CLOSED is emitted.
+
+    Every step is logged to the debug log, and any exception is logged with
+    its traceback and re-raised, so a pill that flashes and vanishes always
+    leaves a footprint. The per-process WebView2 storage folder is removed
+    (best effort) when the GUI loop ends.
+    """
+    import shutil
+    import traceback
+
+    _log(f"subprocess start (sys.executable={sys.executable!r})")
+    try:
+        import webview
+        _log("webview imported")
+
+        compact = "--compact" in sys.argv[1:]
+
+        # Primary-screen width via Win32 (SM_CXSCREEN = 0). In this freshly
+        # spawned, still-DPI-unaware process it is the DPI-virtualised
+        # value, and it avoids pulling tkinter into the compiled binary.
+        try:
+            screen_w = ctypes.windll.user32.GetSystemMetrics(0) or 1920
+        except Exception:
+            screen_w = 1920
+
+        w = COMPACT_SIZE if compact else PILL_WIDTH
+        h = COMPACT_SIZE if compact else PILL_HEIGHT
+        x = max(0, screen_w - w - SCREEN_MARGIN)
+        y = SCREEN_MARGIN
+        html = COMPACT_HTML if compact else BANNER_HTML
+        # Unique title so FindWindowW can locate exactly this window.
+        title = f"AutoUseBanner_{uuid.uuid4().hex[:8]}"
+        state = _BannerState(title=title, width=w, min_h=h, compact=compact)
+        next_clicked, choice_clicked, save_clicked, height_changed = (
+            _make_js_handlers(state)
+        )
+
+        # No js_api here: methods on a Nuitka-compiled class fail
+        # pywebview's `inspect.ismethod` filter and never reach JS. The
+        # handlers are registered through window.expose() instead.
+        window = webview.create_window(
+            title,
+            html=html,
+            width=w,
+            height=h,
+            min_size=(w, h),
+            x=x,
+            y=y,
+            frameless=True,
+            on_top=True,
+            easy_drag=True,
+            resizable=False,
+        )
+        state.window = window
+        window.expose(next_clicked, choice_clicked, save_clicked, height_changed)
+        _log("window created and handlers exposed")
+
+        def _on_shown():
+            _log("on_shown: entered")
+            # Compact mode: WinForms stretches the small create_window
+            # request to its OS-imposed minimum width. Resizing AFTER the
+            # form is alive bypasses that clamp.
+            if compact:
+                try:
+                    window.resize(COMPACT_SIZE, COMPACT_SIZE)
+                    # Give WinForms a moment to realise the new rect, or
+                    # the region clip runs against the old geometry.
+                    time.sleep(0.1)
+                except Exception:
+                    pass
+            # Shaping is cosmetic. A failure here must not stop READY or
+            # the stdin reader, or the parent waits out its full timeout
+            # on a pill that is in fact visible.
+            try:
+                # Clip into a pill (or a circle in compact mode).
+                _apply_rounded_region(title)
+                # The compact indicator is purely visual, so drop mouse
+                # input. The wizard pill must keep receiving clicks.
+                if compact:
+                    _make_click_through(title)
+            except Exception:
+                _log(
+                    "on_shown: window shaping raised:\n"
+                    + traceback.format_exc()
+                )
+            _log("on_shown: about to emit READY")
+            _emit("READY")
+            _log("on_shown: READY emitted")
+            # Spawn the stdin reader once the window is up.
+            threading.Thread(
+                target=_stdin_reader, args=(window,), daemon=True
+            ).start()
+            _log("on_shown: stdin reader thread spawned, exiting handler")
+
+        window.events.shown += _on_shown
+
+        # Lifecycle observability. `events.closing` handlers must return a
+        # truthy value to allow the close; the tuple idiom logs first and
+        # then yields True.
+        window.events.closing += lambda: (_log("event: closing"), True)[1]
+        window.events.closed += lambda: _log("event: closed")
+
+        # Give this process its own WebView2 user-data folder. In the
+        # compiled build the main AutoUse window and this subprocess would
+        # otherwise share pywebview's default folder and contend on it.
+        storage_path = os.path.join(
+            tempfile.gettempdir(), f"autouse_banner_{os.getpid()}"
+        )
+        _log(f"webview.start(storage_path={storage_path!r})")
+
+        # webview.start() runs the GUI loop on this (main) thread and
+        # blocks until the window is destroyed.
+        try:
+            webview.start(storage_path=storage_path)
+            _log("webview.start() returned normally")
+        except Exception:
+            _log("webview.start() raised:\n" + traceback.format_exc())
+            raise
+        finally:
+            # The folder is unique to this PID and never reused on
+            # purpose. Remove it so each launch does not leak a browser
+            # profile into the temp directory. Best effort, since browser
+            # helper processes may still hold some files.
+            shutil.rmtree(storage_path, ignore_errors=True)
+
+        _emit("CLOSED")
+        _log("subprocess exit (CLOSED emitted)")
+    except Exception:
+        # Catches anything escaping the GUI setup so the log shows more
+        # than "subprocess vanished".
+        _log("_run_subprocess_banner crashed:\n" + traceback.format_exc())
+        raise
+
+
+# ── parent-side wrapper ──────────────────────────────────────────────────
+
+
+class StatusBanner:
+ """Drop-in Windows mirror of the macOS Cocoa banner, backed by a
+ subprocess that runs the pywebview pill independently."""
+
+ # Module path the subprocess runs. After merging banner_proc.py
+ # into this file, the subprocess re-executes THIS module with the
+ # `if __name__ == "__main__"` guard firing into
+ # _run_subprocess_banner().
+ _PROC_MODULE = "Auto_Use.windows_use.remote_connection.telegram.banner"
+
+ def __init__(self, compact: bool = False):
+ self._compact = compact
+ self._proc: subprocess.Popen | None = None
+ self._stdout_thread: threading.Thread | None = None
+ self._closed = threading.Event()
+ self._ready = threading.Event()
+ self._next_event = threading.Event()
+ # Distinguishes a real NEXT click from a subprocess-close that also
+ # has to unblock _next_event so waiters don't deadlock. Only the
+ # "NEXT" stdout event flips this to True; close-cleanup leaves it
+ # False so callers can tell the user dismissed the banner.
+ self._next_clicked = False
+ self._choice_q: Queue = Queue()
+ self._input_q: Queue = Queue()
+
+ # ── public API ───────────────────────────────────────────────────────
+
+ def show(self) -> None:
+ if self._proc is not None or self._closed.is_set():
+ return
+
+ # In the Nuitka build, sys.executable is AutoUse.exe — a compiled C
+ # binary with no `-m` module loader. Running it with `-m …banner`
+ # silently re-execs the whole AutoUse app (Flask + main webview +
+ # Telegram bot), giving the user a second main window instead of
+ # the pill. Re-exec AutoUse.exe with --banner-mode so app.py's
+ # main() can route directly to _run_subprocess_banner. In dev
+ # (`python app.py`) sys.executable IS a python interpreter, so
+ # the old -m invocation still works and is preferred — it avoids
+ # the cost of bootstrapping app.py just to reach the banner.
+ # cwd: pin the subprocess to the binary's install dir in the compiled
+ # build so WebView2's native DLL loader resolves WebView2Loader.dll,
+ # WebBrowserInterop.x64.dll, etc. from the install folder regardless
+ # of what cwd the parent inherited (a Start-menu launch leaves cwd
+ # at the user's home dir; a shortcut can leave it anywhere). In dev
+ # mode cwd=None inherits the parent's, which is the repo root —
+ # matches the working behaviour.
+ cwd = None
+ if _IS_COMPILED:
+ exe_dir = os.path.dirname(sys.executable)
+ main_exe = os.path.join(exe_dir, "AutoUse.exe")
+ args = [main_exe, "--banner-mode"]
+ cwd = exe_dir
+ else:
+ args = [sys.executable, "-m", self._PROC_MODULE]
+ if self._compact:
+ args.append("--compact")
+
+ try:
+ self._proc = subprocess.Popen(
+ args,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ # stderr is left attached so the subprocess can write
+ # diagnostics to our terminal (useful for debugging,
+ # never gets parsed).
+ stderr=None,
+ text=True,
+ bufsize=1, # line-buffered
+ cwd=cwd,
+ # On Windows, hide the extra console window subprocess
+ # would otherwise spawn.
+ creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
+ )
+ _stderr(
+ f"spawned banner subprocess pid={self._proc.pid} "
+ f"compact={self._compact}"
+ )
+ except Exception as e:
+ _stderr(f"banner subprocess spawn failed: {e!r}")
+ self._proc = None
+ return
+
+ self._stdout_thread = threading.Thread(
+ target=self._stdout_reader,
+ daemon=True,
+ name="banner-stdout-reader",
+ )
+ self._stdout_thread.start()
+
+ # Block until the subprocess emits READY (banner is visible).
+ # 15 s ceiling covers a cold Python interpreter start; under
+ # normal conditions READY arrives in well under a second.
+ if not self._ready.wait(timeout=15):
+ _stderr("banner subprocess never emitted READY")
+
+ def update(self, text: str) -> None:
+ if self._compact:
+ return
+ self._send({"cmd": "MSG", "text": text or ""})
+
+ def wait_for_next(self, timeout: float | None = None) -> bool:
+ if self._compact:
+ return True
+ if self._proc is None:
+ return True
+ # Banner already dismissed (subprocess gone) — don't pretend the
+ # user clicked Next. Callers use the False return to short-circuit
+ # the wizard instead of opening Edge / advancing steps.
+ if self._closed.is_set():
+ return False
+ self._next_clicked = False
+ self._next_event.clear()
+ self._send({"cmd": "SHOW_NEXT"})
+ self._next_event.wait(timeout=timeout)
+ self._send({"cmd": "HIDE_NEXT"})
+ return self._next_clicked
+
+ def wait_for_choice(
+ self, left_label: str, right_label: str, timeout=None
+ ):
+ if self._compact or self._proc is None:
+ return None
+ self._drain(self._choice_q)
+ self._send({
+ "cmd": "SHOW_CHOICE",
+ "left": left_label,
+ "right": right_label,
+ })
+ try:
+ value = self._choice_q.get(timeout=timeout if timeout else 600)
+ except Empty:
+ value = None
+ self._send({"cmd": "CLEAR"})
+ return value
+
+ def wait_for_input(self, save_label: str = "Save"):
+ if self._compact or self._proc is None:
+ return None
+ self._drain(self._input_q)
+ self._send({"cmd": "SHOW_INPUT", "label": save_label})
+ try:
+ value = self._input_q.get(timeout=600)
+ except Empty:
+ value = None
+ self._send({"cmd": "CLEAR"})
+ return value
+
+ def close(self) -> None:
+ if self._closed.is_set():
+ return
+ self._closed.set()
+ # Unblock anything still parked on a Queue/Event before we tear
+ # the subprocess down.
+ self._next_event.set()
+ try:
+ self._choice_q.put_nowait(None)
+ except Exception:
+ pass
+ try:
+ self._input_q.put_nowait(None)
+ except Exception:
+ pass
+
+ if self._proc is None:
+ return
+
+ # Ask the subprocess to close gracefully; fall back to terminate.
+ self._send({"cmd": "CLOSE"})
+ try:
+ self._proc.wait(timeout=3)
+ except Exception:
+ try:
+ self._proc.terminate()
+ self._proc.wait(timeout=2)
+ except Exception:
+ try:
+ self._proc.kill()
+ except Exception:
+ pass
+ self._proc = None
+
+ # ── internals ────────────────────────────────────────────────────────
+
+ def _send(self, msg: dict) -> None:
+ """Write a JSON command to the subprocess stdin. Silent on
+ broken-pipe errors so a dead subprocess doesn't crash callers."""
+ if self._proc is None or self._proc.stdin is None:
+ return
+ try:
+ self._proc.stdin.write(json.dumps(msg) + "\n")
+ self._proc.stdin.flush()
+ except Exception:
+ pass
+
+ def _stdout_reader(self) -> None:
+ """Read JSON events from the subprocess and route to local
+ Event / Queue primitives so wait_for_* unblock at the right time."""
+ if self._proc is None or self._proc.stdout is None:
+ return
+ for line in self._proc.stdout:
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ msg = json.loads(line)
+ except Exception:
+ continue
+ event = msg.get("event")
+ if event == "READY":
+ _stderr("banner subprocess READY — pill visible")
+ self._ready.set()
+ elif event == "NEXT":
+ # Flag must be set BEFORE the Event so a waiter that wakes
+ # on _next_event.wait() reads the True value, not the
+ # default False left by the close-cleanup path below.
+ self._next_clicked = True
+ self._next_event.set()
+ elif event == "CHOICE":
+ self._choice_q.put(msg.get("value", "left"))
+ elif event == "SAVE":
+ self._input_q.put(msg.get("value", ""))
+ elif event == "CLOSED":
+ _stderr("banner subprocess CLOSED")
+ break
+
+ # Subprocess exited (whether via CLOSED or pipe break). Unblock
+ # any pending waiters so callers don't deadlock.
+ self._closed.set()
+ self._ready.set()
+ self._next_event.set()
+ try:
+ self._choice_q.put_nowait(None)
+ except Exception:
+ pass
+ try:
+ self._input_q.put_nowait(None)
+ except Exception:
+ pass
+
+ @staticmethod
+ def _drain(q: Queue) -> None:
+ try:
+ while True:
+ q.get_nowait()
+ except Empty:
+ pass
+
+
+# ── module entry: run as subprocess if invoked via `python -m …banner` ──
+
+if __name__ == "__main__":
+ _run_subprocess_banner()
diff --git a/Auto_Use/windows_use/remote_connection/telegram/pair.html b/Auto_Use/windows_use/remote_connection/telegram/pair.html
deleted file mode 100644
index cc7b159..0000000
--- a/Auto_Use/windows_use/remote_connection/telegram/pair.html
+++ /dev/null
@@ -1,220 +0,0 @@
-
-
-
-
-
-Auto Use — Pair
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/Auto_Use/windows_use/remote_connection/telegram/service.py b/Auto_Use/windows_use/remote_connection/telegram/service.py
index 5d0f650..55e2a5c 100644
--- a/Auto_Use/windows_use/remote_connection/telegram/service.py
+++ b/Auto_Use/windows_use/remote_connection/telegram/service.py
@@ -17,11 +17,34 @@
# A small attribution goes a long way toward a healthy open-source
# community — thank you for contributing.
+"""Telegram → AgentService bridge with a guided provider/model picker.
+
+Runs as a standalone process (not mounted into Flask). On the first message
+the bot asks you to pick a provider (limited to providers with a non-empty
+key in api_key.txt), then a model (from the same MODEL_MAPPINGS the
+AutoUse frontend uses). Subsequent messages are dispatched as tasks to the
+agent with that provider/model. Picked provider/model persist for the whole
+chat session until you `/reset`.
+
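+Token lookup: TELEGRAM_BOT_TOKEN is read from Auto_Use/api_key/api_key.txt
+only (the copy next to the executable in a compiled build). The
+TELEGRAM_BOT_TOKEN env var and the project-root .env are intentionally NOT
+consulted, so api_key.txt stays the single file of record.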
+
+Setup:
+ 1. @BotFather → /newbot → copy token.
+ 2. Paste it into Auto_Use/api_key/api_key.txt as TELEGRAM_BOT_TOKEN=…
+ 3. Make sure at least one provider key (e.g. OPENROUTER_API_KEY=…) is set.
+ 4. python -m Auto_Use.windows_use.remote_connection.telegram.service
+ 5. On phone: open Telegram, find your bot, send any message.
+"""
import asyncio
-import threading
+import datetime
+import importlib
import logging
+import sys
+import threading
from pathlib import Path
-from typing import Optional
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import (
@@ -30,298 +53,798 @@
MessageHandler,
CallbackQueryHandler,
filters,
- ContextTypes,
)
logger = logging.getLogger(__name__)
-# AgentService is imported lazily inside _run_agent so this module (and the Telegram
-# polling thread) can start without loading tree/element → skimage until a task runs.
-# service.py -> telegram -> remote_connection -> windows_use -> Auto_Use / api_key / api_key.txt
-API_KEY_FILE = Path(__file__).parent.parent.parent.parent / "api_key" / "api_key.txt"
-SCRATCHPAD_PATH = Path(__file__).parent.parent / "scratchpad" / "milestone" / "milestone.md"
+# The Telegram surface treats api_key.txt as its single source of truth — we
+# deliberately do NOT consult .env or env vars here. .env is app.py's general
+# env-loading concern; keeping the bot self-contained against api_key.txt
+# avoids two-files-of-record confusion.
+#
+# Resolve api_key.txt the same way app.py's get_auto_use_path() does: in a
+# compiled/frozen build __file__ points INSIDE the bundle, so the parents[4]
+# walk would miss the editable api_key.txt that lives next to the executable
+# (the one the Settings panel and the regular agent use). Fall back to the
+# source-tree path in dev (python app.py).
+_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals()
+if _IS_COMPILED:
+ _API_KEY_FILE = Path(sys.executable).parent / "Auto_Use" / "api_key" / "api_key.txt"
+else:
+ # service.py → telegram → remote_connection → windows_use → Auto_Use → repo root
+ _API_KEY_FILE = (
+ Path(__file__).resolve().parents[4] / "Auto_Use" / "api_key" / "api_key.txt"
+ )
+
+# Agent writes per-step "milestone" lines here. We tail this file during a
+# task and forward each new line back to the user's Telegram chat so they
+# see the agent's progress in real time.
+SCRATCHPAD_PATH = (
+ Path(__file__).resolve().parents[2] / "scratchpad" / "milestone" / "milestone.md"
+)
+SCRATCHPAD_POLL_SEC = 2.0
+MAX_TG_MSG_LEN = 4000 # Telegram caps at 4096; leave headroom for safety
+# Provider id → API-key name in the KV files. Same mapping the macOS side
+# uses ([macOS_use/remote_connection/telegram/service.py:78-85]).
PROVIDER_KEY_MAP = {
- 'openrouter': 'OPENROUTER_API_KEY',
- 'groq': 'GROQ_API_KEY',
- 'openai': 'OPENAI_API_KEY',
- 'anthropic': 'ANTHROPIC_API_KEY',
- 'google': 'GOOGLE_API_KEY',
- 'perplexity': 'PERPLEXITY_API_KEY',
+ "openrouter": "OPENROUTER_API_KEY",
+ "groq": "GROQ_API_KEY",
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "google": "GOOGLE_API_KEY",
+ "perplexity": "PERPLEXITY_API_KEY",
}
-def _read_api_keys() -> dict:
- """Read api_key.txt and return dict of key_name -> value."""
- keys = {}
- if API_KEY_FILE.exists():
+# ── file helpers ─────────────────────────────────────────────────────────────
+
+def _read_all_keys(path: Path) -> dict:
+ """Parse a simple KEY=VALUE file (one per line) into a dict. Skips empty
+ values and lines starting with '#'."""
+ out = {}
+ if not path.exists():
+ return out
+ try:
+ with open(path, "r", encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if not line or line.startswith("#") or "=" not in line:
+ continue
+ k, _, v = line.partition("=")
+ k, v = k.strip(), v.strip()
+ if v:
+ out[k] = v
+ except Exception:
+ pass
+ return out
+
+
+def _resolve_token() -> str | None:
+ """Read TELEGRAM_BOT_TOKEN from api_key.txt only. .env and env vars are
+ intentionally ignored — see header comment."""
+ return _read_all_keys(_API_KEY_FILE).get("TELEGRAM_BOT_TOKEN")
+
+
+def _get_available_providers() -> list:
+ """Providers with a non-empty key in api_key.txt only."""
+ keys = _read_all_keys(_API_KEY_FILE)
+ return [
+ {"id": pid, "key": keys[kname]}
+ for pid, kname in PROVIDER_KEY_MAP.items()
+ if keys.get(kname)
+ ]
+
+
+def _set_key_in_file(path: Path, key: str, value: str) -> None:
+ """Write/update KEY=value in a KV file, preserving every other line.
+
+ Unlike a naive read-all-and-write-back-with-_read_all_keys, this keeps
+ empty-value placeholder lines (e.g. GROQ_API_KEY=) intact — the AutoUse
+ UI relies on those for its provider list rendering.
+ """
+ lines = []
+ found = False
+ if path.exists():
try:
- with open(API_KEY_FILE, 'r', encoding='utf-8') as f:
- for line in f:
- line = line.strip()
- if '=' in line:
- name, _, value = line.partition('=')
- keys[name] = value
+ with open(path, "r", encoding="utf-8") as f:
+ for raw in f:
+ stripped = raw.strip()
+ if stripped.startswith(f"{key}="):
+ lines.append(f"{key}={value}\n")
+ found = True
+ else:
+ lines.append(raw if raw.endswith("\n") else raw + "\n")
except Exception:
- pass
- return keys
-
-
-def _get_available_providers() -> list[dict]:
- """Return providers that have a non-empty API key in api_key.txt."""
- keys = _read_api_keys()
- available = []
- for provider_id, key_name in PROVIDER_KEY_MAP.items():
- if keys.get(key_name, '').strip():
- available.append({'id': provider_id, 'key': keys[key_name]})
- return available
-
-
-def _get_models_for_provider(provider_id: str) -> list[dict]:
- """Import the view module for a provider and return its non-hidden models."""
- view_modules = {
- 'openrouter': 'Auto_Use.windows_use.llm_provider.openrouter.view',
- 'groq': 'Auto_Use.windows_use.llm_provider.groq.view',
- 'openai': 'Auto_Use.windows_use.llm_provider.openai.view',
- 'anthropic': 'Auto_Use.windows_use.llm_provider.anthropic.view',
- 'google': 'Auto_Use.windows_use.llm_provider.google.view',
- 'perplexity': 'Auto_Use.windows_use.llm_provider.perplexity.view',
- }
- module_path = view_modules.get(provider_id)
- if not module_path:
- return []
+ logger.warning("failed to read %s while updating %s", path, key)
+ return
+ if not found:
+ lines.append(f"{key}={value}\n")
+ try:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, "w", encoding="utf-8") as f:
+ f.writelines(lines)
+ except Exception:
+ logger.warning("failed to write %s", path)
+
+
+def _resolve_owner_chat_id() -> int | None:
+ """Owner chat_id = the chat that last sent /start, any text, or a button
+ tap. Stored in api_key.txt as TELEGRAM_OWNER_CHAT_ID=…, so it survives restarts."""
+ val = _read_all_keys(_API_KEY_FILE).get("TELEGRAM_OWNER_CHAT_ID")
+ if not val:
+ return None
try:
- import importlib
- mod = importlib.import_module(module_path)
- mappings = getattr(mod, 'MODEL_MAPPINGS', {})
+ return int(val)
+ except ValueError:
+ return None
+
+
+def _save_owner_chat_id(chat_id: int) -> None:
+ """Persist the owner chat_id so we can message them on the next boot."""
+ _set_key_in_file(_API_KEY_FILE, "TELEGRAM_OWNER_CHAT_ID", str(chat_id))
+
+
+def _get_models_for_provider(provider_id: str) -> list:
+ """Read MODEL_MAPPINGS from Auto_Use/windows_use/llm_provider/{provider_id}/view.py
+ and return non-hidden entries as [{id, display_name}, …]."""
+ try:
+ mod = importlib.import_module(
+ f"Auto_Use.windows_use.llm_provider.{provider_id}.view"
+ )
+ mappings = getattr(mod, "MODEL_MAPPINGS", {})
return [
- {'id': model_id, 'display_name': info.get('display_name', model_id)}
- for model_id, info in mappings.items()
- if not info.get('hidden', False)
+ {"id": mid, "display_name": info.get("display_name", mid)}
+ for mid, info in mappings.items()
+ if not info.get("hidden", False)
]
except Exception:
return []
-class TelegramAgentBot:
- """Telegram bot that lets users pick a provider/model and run agent tasks."""
-
- def __init__(self, token: str):
- self._token = token
- self._busy = False
- self._stop_event: Optional[threading.Event] = None
- self._pending: dict = {} # chat_id -> {task, provider, api_key}
-
- # ── helpers ───────────────────────────────────────────────────────────
-
- def _monitor_scratchpad(self, chat_id: int, loop, bot, stop_event: threading.Event):
- """Poll the scratchpad file every 5s and send new lines to the Telegram chat."""
- last_pos = 0
- while not stop_event.is_set():
- if SCRATCHPAD_PATH.exists():
- try:
- with open(SCRATCHPAD_PATH, 'r', encoding='utf-8') as f:
- f.seek(last_pos)
- new_content = f.read()
- if new_content:
- last_pos = f.tell()
- lines = new_content.strip().split('\n')
- for line in lines:
- if line.strip():
- text = line.strip()
- for chunk in [text[i:i+4096] for i in range(0, len(text), 4096)]:
- asyncio.run_coroutine_threadsafe(
- bot.send_message(chat_id=chat_id, text=chunk), loop
- )
- except Exception as exc:
- logger.warning("Scratchpad read error: %s", exc)
- stop_event.wait(5)
-
- # Final sweep
- if SCRATCHPAD_PATH.exists():
+# ── per-chat state ───────────────────────────────────────────────────────────
+
+# chat_id → {
+# "phase": "idle" | "pick_provider" | "pick_model" | "ready" | "running",
+# "provider": str | None,
+# "model": str | None,
+# "model_display": str | None,
+# "queue": list[str], # tasks waiting to run, FIFO
+# "pending": dict[str, str], # pending_id → task awaiting Yes/No
+# "pending_counter": int, # monotonic id source for pending
+# }
+_chat_state: dict = {}
+
+# Guards mutations that read+modify state across threads (queue drain races
+# between _run_agent's finally and the callback handler tapping "Yes").
+_state_lock = threading.Lock()
+
+
+def _state(chat_id: int) -> dict:
+ return _chat_state.setdefault(chat_id, {"phase": "idle"})
+
+
+def _maybe_run_next_queued(chat_id: int, bot, loop) -> None:
+ """If this chat is ready and has a queued task, pop the next one and
+ start it. Threadsafe — called from both _run_agent's finally (worker
+ thread) and the q+ callback (asyncio loop)."""
+ with _state_lock:
+ state = _chat_state.get(chat_id)
+ if not state:
+ return
+ if state.get("phase") != "ready":
+ return
+ queue = state.get("queue") or []
+ if not queue:
+ return
+ provider = state.get("provider")
+ model = state.get("model")
+ if not provider or not model:
+ return
+ next_task = queue.pop(0)
+ display = state.get("model_display") or model
+ state["phase"] = "running"
+
+ _send_chat(
+ bot,
+ chat_id,
+ f"📝 Running queued task: {next_task[:200]} ({provider} · {display})",
+ loop,
+ )
+ threading.Thread(
+ target=_run_agent,
+ args=(next_task, provider, model, chat_id, bot, loop),
+ daemon=True,
+ name=f"telegram-agent-{chat_id}-queued",
+ ).start()
+
+
+# ── Telegram handlers ────────────────────────────────────────────────────────
+
+def _build_online_text(providers: list) -> str:
+ now_str = datetime.datetime.now().strftime("%H:%M:%S")
+ if providers:
+ provider_line = ", ".join(p["id"] for p in providers)
+ return f"🟢 AutoUse online at {now_str}\nProviders: {provider_line}"
+ return f"🟢 AutoUse online at {now_str}\nProviders: (none configured)"
+
+
+async def _show_provider_picker(message):
+ providers = _get_available_providers()
+ # Always lead with the "AutoUse online" status line so the user gets the
+ # same greeting they'd see at app boot, even when they message the bot
+ # first instead of waiting for the unsolicited startup announcement.
+ await message.reply_text(_build_online_text(providers))
+ if not providers:
+ await message.reply_text(
+ "⚠️ No provider API keys found. Add at least one (e.g. "
+ "OPENROUTER_API_KEY=…) to api_key.txt or .env and try again."
+ )
+ return False
+ buttons = [
+ [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")]
+ for p in providers
+ ]
+ await message.reply_text(
+ "👋 Pick a provider:", reply_markup=InlineKeyboardMarkup(buttons)
+ )
+ return True
+
+
+async def _discover_owner_from_updates(bot) -> int | None:
+ """Peek at the latest pending update on Telegram's servers and use its
+ chat_id as the owner. Lets the bot self-bootstrap on the very first run
+ after the chat-saving code was deployed, without requiring the user to
+ /start again. Called from _post_init before polling starts — uses offset=-1,
+ which Telegram supports as 'just the most recent update'. NOTE(review): the
+ Bot API says a negative offset forgets all earlier pending updates — confirm."""
+ try:
+ updates = await bot.get_updates(offset=-1, limit=1, timeout=2)
+ except Exception:
+ logger.warning("owner discovery: get_updates failed", exc_info=True)
+ return None
+ for upd in updates:
+ chat = getattr(upd, "effective_chat", None)
+ if chat and chat.id:
+ return int(chat.id)
+ return None
+
+
+async def _post_init(application) -> None:
+ """Fires once after the bot finishes initialising (before polling starts).
+ Used to message the saved owner: 'AutoUse online at …' + a fresh provider
+ picker — so the user doesn't have to send anything to get going."""
+ owner_id = _resolve_owner_chat_id()
+ if not owner_id:
+ # Not saved yet — try to auto-discover from Telegram's pending updates.
+ # Works if the user has ever messaged the bot, even before the
+ # chat-saving code was deployed. Persist the result so we don't need
+ # to re-discover on every boot.
+ owner_id = await _discover_owner_from_updates(application.bot)
+ if owner_id:
try:
- with open(SCRATCHPAD_PATH, 'r', encoding='utf-8') as f:
- f.seek(last_pos)
- new_content = f.read()
- if new_content:
- for line in new_content.strip().split('\n'):
- if line.strip():
- asyncio.run_coroutine_threadsafe(
- bot.send_message(chat_id=chat_id, text=line.strip()), loop
- )
+ _save_owner_chat_id(owner_id)
+ logger.info(
+ "owner discovery: saved chat_id=%s from getUpdates",
+ owner_id,
+ )
except Exception:
- pass
+ logger.warning("owner discovery: could not persist chat_id", exc_info=True)
+ if not owner_id:
+ # No owner anywhere — they've never interacted with the bot. Stay
+ # silent; they'll register themselves with /start.
+ return
+ bot = application.bot
+ providers = _get_available_providers()
+ try:
+ await bot.send_message(chat_id=owner_id, text=_build_online_text(providers))
+ except Exception:
+ logger.exception("startup announcement: failed to send hello")
+ return # if we can't even greet, don't bother with the picker
- def _run_agent(self, task: str, provider: str, model: str, api_key: str,
- chat_id: int, loop, bot):
+ if not providers:
try:
- from ...agent.service import AgentService
-
- agent = AgentService(
- provider=provider,
- model=model,
- save_conversation=True,
- thinking=True,
- api_key=api_key,
- stop_event=self._stop_event,
+ await bot.send_message(
+ chat_id=owner_id,
+ text="⚠️ No provider API keys found. Add at least one to api_key.txt and /reset.",
)
+ except Exception:
+ pass
+ return
- monitor = threading.Thread(
- target=self._monitor_scratchpad,
- args=(chat_id, loop, bot, self._stop_event),
- daemon=True,
- )
- monitor.start()
-
- agent.process_request(task)
-
- asyncio.run_coroutine_threadsafe(
- bot.send_message(chat_id=chat_id, text="✅ Task completed."), loop
- )
- except Exception as exc:
- logger.exception("Agent error")
- asyncio.run_coroutine_threadsafe(
- bot.send_message(chat_id=chat_id, text=f"❌ Agent error: {exc}"), loop
- )
- finally:
- self._busy = False
- self._stop_event = None
- self._pending.pop(chat_id, None)
+ buttons = [
+ [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")]
+ for p in providers
+ ]
+ try:
+ await bot.send_message(
+ chat_id=owner_id,
+ text="👋 Pick a provider:",
+ reply_markup=InlineKeyboardMarkup(buttons),
+ )
+ # Park the owner's chat in pick_provider so the next button tap routes
+ # cleanly through the existing callback flow.
+ _chat_state[owner_id] = {"phase": "pick_provider"}
+ except Exception:
+ logger.exception("startup announcement: failed to send provider picker")
- # ── Telegram handlers ────────────────────────────────────────────────
- async def start_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
+async def start_cmd(update, ctx):
+ chat_id = update.effective_chat.id
+ # Remember this chat so future boots can auto-greet (startup announcement).
+ # Best-effort — never let a file-write failure block /start.
+ try:
+ _save_owner_chat_id(chat_id)
+ except Exception:
+ logger.warning("could not persist owner chat_id", exc_info=True)
+ _chat_state[chat_id] = {"phase": "pick_provider"}
+ ok = await _show_provider_picker(update.message)
+ if not ok:
+ _chat_state[chat_id] = {"phase": "idle"}
+
+
+async def reset_cmd(update, ctx):
+ # Wipe state for this chat — including any queued tasks and any queue
+ # prompts still awaiting Yes/No. We do NOT clear the persisted owner chat_id;
+ # /reset is "start over the conversation", not "forget I exist".
+ _chat_state[update.effective_chat.id] = {"phase": "idle"}
+ await update.message.reply_text(
+ "🔄 Reset. Send any message to pick a provider again."
+ )
+
+
+async def text_handler(update, ctx):
+ chat_id = update.effective_chat.id
+ # Persist on every message, not just /start, so the next app boot can
+ # auto-announce "AutoUse online" without the user having to /start first.
+ try:
+ _save_owner_chat_id(chat_id)
+ except Exception:
+ logger.warning("could not persist owner chat_id", exc_info=True)
+ state = _state(chat_id)
+ phase = state.get("phase", "idle")
+
+ if phase in ("idle", "pick_provider"):
+ state["phase"] = "pick_provider"
+ ok = await _show_provider_picker(update.message)
+ if not ok:
+ state["phase"] = "idle"
+ return
+
+ if phase == "pick_model":
await update.message.reply_text(
- "👋 Send me a task and I will execute it on the desktop.\n\n"
- "Commands:\n"
- "/stop – abort current task\n"
- "/status – check if a task is running"
+ "Pick a model from the buttons above first."
)
+ return
- async def stop_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
- if self._stop_event and self._busy:
- self._stop_event.set()
- await update.message.reply_text("🛑 Stop signal sent.")
- else:
- await update.message.reply_text("No task is running.")
-
- async def status_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
- if self._busy:
- await update.message.reply_text("⏳ A task is currently running. Send /stop to cancel.")
- else:
- await update.message.reply_text("💤 Idle – send a message to start a task.")
-
- async def task_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
- """User sends a text message → store it as a pending task, show provider buttons."""
- if self._busy:
- await update.message.reply_text(
- "⏳ A task is already running. Send /stop first, then try again."
- )
- return
-
- task = update.message.text.strip()
+ if phase == "running":
+ # Busy — offer to queue this task. Each pending prompt gets a unique
+ # id so multiple "queue this?" prompts can coexist if the user spams.
+ task = (update.message.text or "").strip()
if not task:
return
-
- providers = _get_available_providers()
- if not providers:
- await update.message.reply_text(
- "⚠️ No API keys configured.\n"
- "Add provider API keys through the Auto Use desktop app settings first."
+ state.setdefault("pending", {})
+ state["pending_counter"] = state.get("pending_counter", 0) + 1
+ pending_id = str(state["pending_counter"])
+ state["pending"][pending_id] = task
+ buttons = [[
+ InlineKeyboardButton("✅ Yes, queue it", callback_data=f"q+:{pending_id}"),
+ InlineKeyboardButton("❌ No", callback_data=f"q-:{pending_id}"),
+ ]]
+ await update.message.reply_text(
+ f"⏳ Currently busy performing a task.\n"
+ f"Do you want to queue: \"{task[:200]}\" ?",
+ reply_markup=InlineKeyboardMarkup(buttons),
+ )
+ return
+
+ # phase == "ready"
+ task = (update.message.text or "").strip()
+ if not task:
+ return
+ state["phase"] = "running"
+ provider = state["provider"]
+ model = state["model"]
+ display = state.get("model_display", model)
+ await update.message.reply_text(
+ f"📝 Running: {task} ({provider} · {display})"
+ )
+ bot = ctx.bot
+ loop = asyncio.get_running_loop()
+ threading.Thread(
+ target=_run_agent,
+ args=(task, provider, model, chat_id, bot, loop),
+ daemon=True,
+ ).start()
+
+
+async def callback_handler(update, ctx):
+ query = update.callback_query
+ await query.answer()
+ chat_id = query.message.chat_id
+ try:
+ _save_owner_chat_id(chat_id)
+ except Exception:
+ logger.warning("could not persist owner chat_id", exc_info=True)
+ state = _state(chat_id)
+ data = query.data or ""
+
+ if data.startswith("provider:"):
+ provider_id = data.split(":", 1)[1]
+ state["provider"] = provider_id
+ state["phase"] = "pick_model"
+ models = _get_models_for_provider(provider_id)
+ if not models:
+ state["phase"] = "pick_provider"
+ await query.edit_message_text(
+ f"⚠️ No models found for {provider_id}. Pick another provider."
)
return
-
- chat_id = update.effective_chat.id
- self._pending[chat_id] = {'task': task}
-
buttons = [
- [InlineKeyboardButton(p['id'], callback_data=f"provider:{p['id']}")]
- for p in providers
+ [InlineKeyboardButton(m["display_name"], callback_data=f"model:{m['id']}")]
+ for m in models
]
- await update.message.reply_text(
- f"📝 Task received:\n{task}\n\nChoose a provider:",
+ await query.edit_message_text(
+ f"Pick a model for {provider_id}:",
reply_markup=InlineKeyboardMarkup(buttons),
)
+ return
+
+ if data.startswith("model:"):
+ model_id = data.split(":", 1)[1]
+ provider_id = state.get("provider")
+ if not provider_id:
+ state["phase"] = "idle"
+ await query.edit_message_text("Session expired. Send any message to start over.")
+ return
+ models = _get_models_for_provider(provider_id)
+ display = next(
+ (m["display_name"] for m in models if m["id"] == model_id), model_id
+ )
+ state["model"] = model_id
+ state["model_display"] = display
+ state["phase"] = "ready"
+ await query.edit_message_text(
+ f"✅ Provider: {provider_id} / Model: {display}\n"
+ f"Send me a task whenever you're ready."
+ )
+ return
- async def callback_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE):
- """Handle inline-keyboard button presses for provider/model selection."""
- query = update.callback_query
- await query.answer()
- chat_id = query.message.chat_id
- data = query.data
-
- pending = self._pending.get(chat_id)
- if not pending:
- await query.edit_message_text("Session expired. Send a new task.")
+ if data.startswith("q+:"):
+ # User wants to queue the pending task.
+ pending_id = data.split(":", 1)[1]
+ task = (state.get("pending") or {}).pop(pending_id, None)
+ if not task:
+ await query.edit_message_text("(That prompt has already been handled.)")
return
+ state.setdefault("queue", []).append(task)
+ qlen = len(state["queue"])
+ await query.edit_message_text(
+ f"📥 Queued (position {qlen}): \"{task[:200]}\"\n"
+ f"Will run when the current task finishes."
+ )
+ # Edge case: agent finished in the milliseconds between the prompt
+ # being sent and the user tapping Yes. Drain the queue now so the
+ # queued task isn't stranded.
+ _maybe_run_next_queued(chat_id, ctx.bot, asyncio.get_running_loop())
+ return
+
+ if data.startswith("q-:"):
+ # User declines to queue. Drop the pending task.
+ pending_id = data.split(":", 1)[1]
+ (state.get("pending") or {}).pop(pending_id, None)
+ await query.edit_message_text(
+ "👍 OK, won't queue it. I'll let you know once the current task is done."
+ )
+ return
- if data.startswith("provider:"):
- provider_id = data.split(":", 1)[1]
- providers = _get_available_providers()
- api_key = next((p['key'] for p in providers if p['id'] == provider_id), None)
- if not api_key:
- await query.edit_message_text("⚠️ API key for this provider is no longer available.")
- self._pending.pop(chat_id, None)
- return
-
- pending['provider'] = provider_id
- pending['api_key'] = api_key
-
- models = _get_models_for_provider(provider_id)
- if not models:
- await query.edit_message_text(f"⚠️ No models found for {provider_id}.")
- self._pending.pop(chat_id, None)
- return
-
- buttons = [
- [InlineKeyboardButton(m['display_name'], callback_data=f"model:{m['id']}")]
- for m in models
- ]
- await query.edit_message_text(
- f"Provider: {provider_id}\n\nChoose a model:",
- reply_markup=InlineKeyboardMarkup(buttons),
- )
- elif data.startswith("model:"):
- model_id = data.split(":", 1)[1]
- provider = pending.get('provider')
- api_key = pending.get('api_key')
- task = pending.get('task')
+# ── scratchpad streaming ─────────────────────────────────────────────────────
+
+def _send_chat(bot, chat_id, text, loop, wait: bool = False, timeout: float = 5.0):
+    """Schedule bot.send_message on the asyncio loop from a worker thread.
+
+    Best-effort: every failure is caught and logged with its traceback, so
+    a transient send error never kills the calling (monitor) thread.
+
+    When wait=True, block the calling thread until the send actually
+    completes (or `timeout` seconds elapse). Used for terminal messages
+    like "✅ Done." that must land in the chat BEFORE the next message is
+    scheduled — without it, the "Done" send and the "Running queued task"
+    send race inside the asyncio loop as two parallel HTTP POSTs and
+    Telegram can deliver them out of order. A send that outlives the
+    timeout is left running rather than cancelled.
+
+    When wait=False the call returns as soon as the send is scheduled; a
+    later failure of the send itself is reported by a done-callback.
+    """
+    def _log_failure(done):
+        # Fire-and-forget path: surface the error nobody is waiting for.
+        if not done.cancelled() and done.exception() is not None:
+            logger.warning(
+                "send_message to chat %s failed", chat_id,
+                exc_info=done.exception(),
+            )
+
+    try:
+        fut = asyncio.run_coroutine_threadsafe(
+            bot.send_message(chat_id=chat_id, text=text), loop
+        )
+        if not wait:
+            fut.add_done_callback(_log_failure)
+            return
+        try:
+            fut.result(timeout=timeout)
+        except Exception:
+            logger.warning(
+                "send_message to chat %s did not confirm within %ss",
+                chat_id, timeout, exc_info=True,
+            )
+    except Exception:
+        logger.warning(
+            "Failed to schedule send_message to chat %s", chat_id, exc_info=True
+        )
+
+
+def _monitor_scratchpad(chat_id, bot, loop, stop_event, start_pos):
+    """Tail SCRATCHPAD_PATH and forward each new non-empty line to the chat.
+
+    Polls every SCRATCHPAD_POLL_SEC seconds. start_pos is the byte offset
+    the file was at when the task began — only content written AFTER that
+    is forwarded, so old milestones from previous tasks aren't replayed.
+    Exits when stop_event is set, after one final sweep to flush any tail.
+
+    The file is read in binary mode so the tracked offset is a real byte
+    count (directly comparable with the file size). During polling only
+    complete, newline-terminated lines are consumed; a line the writer has
+    not finished yet is left for the next poll instead of being sent as two
+    fragments. The final sweep also forwards an unterminated last line.
+    """
+    last_pos = start_pos
+
+    def _read_and_forward(final=False):
+        nonlocal last_pos
+        try:
+            with open(SCRATCHPAD_PATH, "rb") as f:
+                # seek(0, 2) jumps to EOF and returns the size in bytes. A
+                # file shorter than last_pos was truncated or rotated, so
+                # restart from byte 0 rather than reading from past its end.
+                if f.seek(0, 2) < last_pos:
+                    last_pos = 0
+                f.seek(last_pos)
+                data = f.read()
+        except FileNotFoundError:
+            # File was deleted (e.g. AgentService.__init__ wiping the
+            # scratchpad). Reset so the next poll reads the new file from
+            # the top instead of seeking past its end.
+            last_pos = 0
+            return
+        except Exception as exc:
+            logger.warning("Scratchpad read error: %s", exc)
+            return
+        if not final:
+            # Drop a trailing partial line; it is re-read once complete.
+            # rfind() == -1 (no newline at all) leaves nothing to forward.
+            data = data[: data.rfind(b"\n") + 1]
+        if not data:
+            return
+        last_pos += len(data)
+        text = data.decode("utf-8", errors="replace")
+        for raw in text.splitlines():
+            line = raw.strip()
+            if not line:
+                continue
+            # Chunk excessively long lines so we stay under Telegram's 4096 cap.
+            for i in range(0, len(line), MAX_TG_MSG_LEN):
+                _send_chat(bot, chat_id, line[i : i + MAX_TG_MSG_LEN], loop)
+
+    while not stop_event.is_set():
+        _read_and_forward()
+        stop_event.wait(SCRATCHPAD_POLL_SEC)
+
+    # Final sweep — catches anything written between the last poll and the
+    # stop_event being set (e.g. the agent's very last milestone).
+    _read_and_forward(final=True)
+
+
+# ── agent runner (worker thread) ─────────────────────────────────────────────
+
+def _run_agent(task, provider, model, chat_id, bot, loop):
+    """Run one agent task on a worker thread and report the outcome to the chat.
+
+    While the agent works, scratchpad milestones are streamed to the chat by
+    a monitor thread. A compact banner is shown and the main AutoUse window
+    is minimised (both best-effort) so the agent has the screen to itself.
+    When the task ends — success or error — the chat's phase is put back to
+    'ready' if it was 'running', and _maybe_run_next_queued is given the
+    chance to start one queued task.
+    """
+    # Compact "Telegram task in progress" indicator + minimise AutoUse window.
+    # Both are best-effort — never let UI fluff block the actual task. The
+    # import and constructor live inside the try as well: if they raised
+    # out here we would leave before the try/finally below and the chat's
+    # phase would never be reset from 'running'.
+    task_banner = None
+    try:
+        from Auto_Use.windows_use.remote_connection.telegram.banner import StatusBanner
+        task_banner = StatusBanner(compact=True)
+        task_banner.show()
+    except Exception:
+        logger.warning("could not show task banner", exc_info=True)
+    # Minimise the AutoUse pywebview window so the agent has the screen to
+    # itself. We talk to pywebview directly via its global `windows` list
+    # rather than importing from app.py — `python app.py` makes app.py the
+    # __main__ module, so `from app import …` would re-import a *second*
+    # copy of app.py whose webview_window is still None, and the call would
+    # silently no-op.
+    try:
+        import webview as _webview
+        if _webview.windows:
+            _webview.windows[0].minimize()
+    except Exception:
+        logger.warning("could not minimise AutoUse window", exc_info=True)
+
+    # Reset the milestone scratchpad to empty before starting the monitor.
+    # AgentService.__init__ wipes the entire scratchpad/ directory in
+    # _cleanup_scratchpad() — so if we snapshotted the file's current size
+    # here and the agent then deleted + rewrote it, the monitor's last_pos
+    # would point mid-way into the fresh content and we'd stream a
+    # fragment (e.g. "ome." instead of "Verified: …Edge.") to the chat.
+    # Deleting the file ourselves up front and starting from byte 0 keeps
+    # the monitor aligned with whatever the agent writes next. Best-effort
+    # — a failure here just degrades us back to the old (buggy) behavior.
+    try:
+        if SCRATCHPAD_PATH.exists():
+            SCRATCHPAD_PATH.unlink()
+    except Exception:
+        logger.warning("could not reset milestone scratchpad", exc_info=True)
+    start_pos = 0
+    stop_event = threading.Event()
+    monitor = threading.Thread(
+        target=_monitor_scratchpad,
+        args=(chat_id, bot, loop, stop_event, start_pos),
+        daemon=True,
+        name=f"telegram-scratchpad-{chat_id}",
+    )
+    monitor.start()
+
+    try:
+        # Imported lazily — pulls in tree/element → skimage etc., which we
+        # don't want to load until a task actually runs.
+        from Auto_Use.windows_use.agent.service import AgentService
+
+        # Look up the runtime API key for the chosen provider so
+        # LLMManager doesn't fall back to an os.getenv() that the user
+        # never set. Mirrors app.py's get_provider_api_key path —
+        # Telegram users edit api_key.txt (or use the AutoUse Settings
+        # panel), they don't export env vars, so without passing
+        # `api_key=` here the agent dies before its first scan with
+        # "X API key not provided and not found in .env file".
+        # _get_available_providers already gated the picker to non-
+        # empty keys, so the lookup is expected to return a value.
+        provider_key_name = PROVIDER_KEY_MAP.get(provider)
+        provider_keys = _read_all_keys(_API_KEY_FILE)
+        provider_api_key = (
+            provider_keys.get(provider_key_name) if provider_key_name else None
+        )
- if not all([provider, api_key, task]):
- await query.edit_message_text("Session expired. Send a new task.")
- self._pending.pop(chat_id, None)
- return
+        agent = AgentService(
+            provider=provider,
+            model=model,
+            save_conversation=False,
+            thinking=True,
+            api_key=provider_api_key,
+        )
+        agent.process_request(task)
+        # Stop the monitor BEFORE the done message so the final scratchpad
+        # sweep happens first — keeps the chat in correct chronological order.
+        stop_event.set()
+        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
+        # wait=True: block until "✅ Done." is on Telegram's servers before
+        # the finally-block fires _maybe_run_next_queued, which would
+        # otherwise schedule "📝 Running queued task: …" as a second,
+        # concurrent HTTP POST that can race past Done in delivery.
+        _send_chat(bot, chat_id, "✅ Done.", loop, wait=True)
+    except Exception as e:
+        logger.exception("agent error")
+        stop_event.set()
+        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
+        _send_chat(bot, chat_id, f"❌ Error: {e}", loop, wait=True)
+    finally:
+        if not stop_event.is_set():
+            stop_event.set()
+        if task_banner is not None:
+            try:
+                task_banner.close()
+            except Exception:
+                # Still best-effort, but leave a trace instead of hiding it.
+                logger.warning("could not close task banner", exc_info=True)
+        with _state_lock:
+            state = _chat_state.get(chat_id)
+            if state is not None and state.get("phase") == "running":
+                state["phase"] = "ready"
+        # Drain one queued task if any — keeps phase='running' if it spawns.
+        _maybe_run_next_queued(chat_id, bot, loop)
+
+
+# ── entry points ─────────────────────────────────────────────────────────────
+
+def _build_telegram_app(token: str):
+    """Create the python-telegram-bot Application and register our handlers.
+
+    `_post_init` is registered as the builder's post_init hook, which
+    python-telegram-bot calls once after the bot finishes initialising but
+    before polling starts — the spot used to send the "AutoUse online"
+    announcement + provider picker to the saved owner.
+    """
+    tg_app = Application.builder().token(token).post_init(_post_init).build()
+    # Registration order is preserved: /start, /reset, button callbacks,
+    # then plain (non-command) text.
+    handlers = (
+        CommandHandler("start", start_cmd),
+        CommandHandler("reset", reset_cmd),
+        CallbackQueryHandler(callback_handler),
+        MessageHandler(filters.TEXT & ~filters.COMMAND, text_handler),
+    )
+    for handler in handlers:
+        tg_app.add_handler(handler)
+    return tg_app
+
+
+# Thread running the bot's polling loop; assigned by start_bot(), None until then.
+_BOT_THREAD: threading.Thread | None = None
+
+
+def _stderr(msg: str) -> None:
+    """Print `msg` to stderr with a "[telegram] " prefix, flushing at once so
+    it shows up in the terminal regardless of the logging configuration."""
+    from sys import stderr
+    print("[telegram]", msg, file=stderr, flush=True)
+
+
+async def _run_bot_until_stopped(tg_app):
+ """Manual lifecycle replacement for Application.run_polling().
+
+ run_polling() messes with signals and assumes it owns the main thread;
+ we want to drive it from a worker thread so we do it step by step.
+
+ Sequence used here:
+ initialize → start → _post_init → start_polling.
+ NOTE(review): run_polling() itself appears to invoke post_init right
+ after initialize() and to call start() only once the updater is polling,
+ so this is not necessarily the same order — confirm against the
+ installed python-telegram-bot version.
+ We call _post_init BEFORE start_polling so its bot.get_updates(offset=-1)
+ auto-discovery doesn't race with the updater's own polling loop.
+
+ Never returns normally: after polling starts it awaits an Event that is
+ never set. A failure in _post_init is logged and does not stop startup.
+ """
+ await tg_app.initialize()
+ await tg_app.start()
+ # Application.post_init() is only invoked by run_polling(), not by the
+ # manual initialize+start path above. Call our startup announcement
+ # explicitly so the saved owner gets the "AutoUse online" message.
+ try:
+ await _post_init(tg_app)
+ except Exception:
+ logger.exception("post_init failed")
+ await tg_app.updater.start_polling(allowed_updates=Update.ALL_TYPES)
+ _stderr("polling loop is live — send your bot a message")
+ # Park here forever (daemon thread; killed on app exit).
+ await asyncio.Event().wait()
+
+
+def start_bot() -> None:
+    """Launch the Telegram bot's polling loop on a daemon thread.
+
+    Safe to call repeatedly from app.py boot: a call made while the bot
+    thread is alive is a no-op, and a missing token only prints a hint.
+    Each milestone is printed loudly to stderr so the user can follow along.
+    """
+    global _BOT_THREAD
+    already_running = _BOT_THREAD is not None and _BOT_THREAD.is_alive()
+    if already_running:
+        _stderr("start_bot() called but the bot is already running — skipping.")
+        return
+    token = _resolve_token()
+    if not token:
+        _stderr(
+            "BOT NOT STARTED — TELEGRAM_BOT_TOKEN not found in env, .env, or "
+            "api_key.txt. Paste your @BotFather token into one of those files."
+        )
+        return
+    _stderr(f"starting bot (token ends in …{token[-6:]})")
- self._busy = True
- self._stop_event = threading.Event()
+
+    def _thread_main():
+        import sys
+        import traceback
+        try:
+            # Each thread needs its own asyncio event loop: create one and
+            # make it current before any asyncio object is built on it.
+            bot_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(bot_loop)
+            application = _build_telegram_app(token)
+            try:
+                bot_loop.run_until_complete(_run_bot_until_stopped(application))
+            finally:
+                bot_loop.close()
+        except Exception as exc:
+            # Nothing above us will report it, so print the crash ourselves.
+            _stderr(f"BOT CRASHED: {exc!r}")
+            traceback.print_exc(file=sys.stderr)
+
+    bot_thread = threading.Thread(target=_thread_main, daemon=True, name="telegram-bot")
+    _BOT_THREAD = bot_thread
+    bot_thread.start()
+
+
+def main():
+    """Run the bot in the foreground on the main thread — a standalone entry
+    for testing without launching the full AutoUse app.
+
+    Raises SystemExit with a hint when no bot token can be resolved.
+    """
+    bot_token = _resolve_token()
+    if not bot_token:
+        hint = (
+            f"TELEGRAM_BOT_TOKEN not found in {_API_KEY_FILE}\n"
+            "(create the bot via @BotFather first, then add the token to that file)."
+        )
+        raise SystemExit(hint)
+    application = _build_telegram_app(bot_token)
+    logger.info("Telegram bot polling started (main thread)")
+    application.run_polling(allowed_updates=Update.ALL_TYPES)
- await query.edit_message_text("🤔 Thinking...")
- loop = asyncio.get_running_loop()
- bot = context.bot
- thread = threading.Thread(
- target=self._run_agent,
- args=(task, provider, model_id, api_key, chat_id, loop, bot),
- daemon=True,
- )
- thread.start()
-
- # ── public entry point ───────────────────────────────────────────────
-
- def run(self):
- """Start polling (blocking). Called from a thread by app.py."""
- app = Application.builder().token(self._token).build()
- app.add_handler(CommandHandler("start", self.start_handler))
- app.add_handler(CommandHandler("stop", self.stop_handler))
- app.add_handler(CommandHandler("status", self.status_handler))
- app.add_handler(CallbackQueryHandler(self.callback_handler))
- app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, self.task_handler))
-
- logger.info("Telegram bot polling started")
- app.run_polling(allowed_updates=Update.ALL_TYPES)
-
- def stop(self):
- """Signal any running agent to stop."""
- if self._stop_event:
- self._stop_event.set()
+if __name__ == "__main__":
+ # Run as a script: set up timestamped INFO logging, then start the bot
+ # through main().
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+ )
+ main()
diff --git a/Auto_Use/windows_use/remote_connection/telegram/setup.py b/Auto_Use/windows_use/remote_connection/telegram/setup.py
new file mode 100644
index 0000000..3edaba2
--- /dev/null
+++ b/Auto_Use/windows_use/remote_connection/telegram/setup.py
@@ -0,0 +1,203 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+"""Telegram remote-connection setup driver (Windows, guided mode).
+
+Opens Microsoft Edge, navigates to web.telegram.org, then lets the user log
+in manually. Progress is paced by a small always-on-top banner that streams
+status text and has a Next button. The script blocks on user clicks via
+banner.wait_for_next() — the user does the actual login (phone, country,
+OTP) themselves; we just get them to the right page.
+"""
+import logging
+import os
+import threading
+import time
+
+from Auto_Use.windows_use.controller.tool.open_app import open_on_windows
+from Auto_Use.windows_use.tree.element import UIElementScanner, ELEMENT_CONFIG
+from Auto_Use.windows_use.controller.service import ControllerService
+from Auto_Use.windows_use.controller.key_combo.service import KeyComboService
+from Auto_Use.windows_use.remote_connection.telegram.banner import StatusBanner
+from Auto_Use.windows_use.remote_connection.telegram.service import (
+ _API_KEY_FILE, _set_key_in_file,
+)
+
+logger = logging.getLogger(__name__)
+
+# Text entered into Edge's address bar by _open_telegram_in_edge (no scheme).
+TELEGRAM_WEB_URL = "web.telegram.org"
+# Pause, in seconds, passed to time.sleep() between UI automation steps.
+STEP_DELAY_SEC = 2
+
+# Singleton guard — /api/telegram/connect spawns a fresh daemon thread on
+# every POST, so a rapid double-click or polling-induced re-fire would
+# otherwise launch parallel banner wizards. We let the redundant calls
+# return immediately while the first one runs to completion.
+_SETUP_LOCK = threading.Lock()
+_SETUP_ACTIVE = False
+
+# Edge candidates tried in order — `open_on_windows` does fuzzy matching, but
+# different Windows installs surface Edge under slightly different Start-Menu
+# entries (PWA shortcut vs. "Microsoft Edge.lnk" vs. plain `msedge.exe` on
+# PATH). Try the cleanest one first; fall back to broader strings.
+EDGE_NAME_CANDIDATES = ("msedge", "Microsoft Edge", "edge")
+
+
+def _find_address_bar(mapping: dict) -> str | None:
+    """Locate Edge's address bar in a scanned element mapping.
+
+    Edge exposes the bar in the UIA tree as name="Address and search bar",
+    type="Edit" (confirmed from a live scan saved at
+    debug/element/ui_elements_1778913911.txt:8). Returns the key of the
+    first entry matching both fields, or None when nothing matches.
+    """
+    wanted = ("Address and search bar", "Edit")
+    matches = (
+        key
+        for key, info in mapping.items()
+        if (info.get("name"), info.get("type")) == wanted
+    )
+    return next(matches, None)
+
+
+def _launch_edge() -> bool:
+    """Open Edge using the first EDGE_NAME_CANDIDATES entry that launches.
+
+    Candidates are tried in order. One that raises is logged and skipped;
+    one that returns a falsy value is skipped silently. Returns True on the
+    first truthy result, False once every candidate has been exhausted.
+    """
+    for candidate in EDGE_NAME_CANDIDATES:
+        try:
+            launched = open_on_windows(candidate)
+        except Exception:
+            logger.warning("open_on_windows(%r) raised", candidate, exc_info=True)
+            continue
+        if launched:
+            return True
+    return False
+
+
+def _open_telegram_in_edge(banner) -> bool:
+ """Launch Edge and navigate it to web.telegram.org.
+
+ Streams sub-step status to the banner so the user can see what's happening
+ while Edge takes focus.
+
+ Returns False when Edge cannot be launched or when its address bar is not
+ found in the element scan; returns True after the Return key has been
+ sent. Exceptions raised by the scanner, controller or key-combo calls are
+ not caught here and propagate to the caller.
+ """
+ banner.update("Please wait — confirming Edge is open…")
+ if not _launch_edge():
+ logger.error("setup.py: failed to launch Microsoft Edge")
+ return False
+ # open_on_windows already sleeps a moment after launching, but the
+ # address bar isn't reliably populated in the UIA tree immediately —
+ # give Edge another beat to settle before we scan.
+ time.sleep(1)
+
+ # Single scan of the current UI; there is no retry if the bar is missing.
+ scanner = UIElementScanner(ELEMENT_CONFIG)
+ scanner.scan_elements()
+ mapping = scanner.get_elements_mapping()
+ time.sleep(STEP_DELAY_SEC)
+
+ address_bar_index = _find_address_bar(mapping)
+ if address_bar_index is None:
+ logger.error("setup.py: Edge address bar not found in scan")
+ return False
+
+ banner.update("Edge detected. Writing the URL for you, please wait…")
+
+ # The controller is handed the same mapping the index came from, so the
+ # index passed to click() below refers to that scan.
+ controller = ControllerService()
+ controller.set_elements(mapping, scanner.application_name)
+ key_combo = KeyComboService()
+
+ # Click the address bar, enter the URL, then press Return, pausing
+ # STEP_DELAY_SEC between steps. Presumably canvas_input types into
+ # whatever currently has focus — TODO confirm against ControllerService.
+ controller.click(address_bar_index)
+ time.sleep(STEP_DELAY_SEC)
+
+ controller.canvas_input(TELEGRAM_WEB_URL)
+ time.sleep(STEP_DELAY_SEC)
+
+ key_combo.send("return")
+ return True
+
+
+def run(country_code: str = "", phone: str = "") -> bool:
+    """Guided Telegram bot pairing, paced by the floating banner.
+
+    Flow: intro → open Telegram Web in Edge → user logs in manually → user
+    opens @BotFather → choose "Fresh setup" or "Token already generated" →
+    paste the token → token is stored with _set_key_in_file → the whole
+    process exits so the next launch starts the bot.
+
+    country_code and phone are accepted but ignored — kept only so the
+    pre-existing /api/telegram/connect callsite signature still works.
+
+    Idempotent under concurrent calls: if a wizard is already running,
+    redundant invocations return False immediately so we don't end up
+    with N parallel banners in the taskbar.
+
+    Returns False when a wizard is already active, when a wait on the banner
+    returns a falsy value, or when Edge could not be driven to Telegram Web.
+    On success it does not return: os._exit(0) ends the process.
+    """
+    global _SETUP_ACTIVE
+    with _SETUP_LOCK:
+        if _SETUP_ACTIVE:
+            logger.info(
+                "setup.run: wizard already running — ignoring duplicate Connect"
+            )
+            return False
+        _SETUP_ACTIVE = True
+
+    banner = None
+    try:
+        # Built inside the try so that a failure to create or show the banner
+        # still reaches the finally below and releases the singleton guard;
+        # otherwise _SETUP_ACTIVE would stay True and block every later run.
+        banner = StatusBanner()
+        banner.show()
+
+        banner.update("Let's get you set up with Telegram. Please click Next.")
+        if not banner.wait_for_next():
+            return False
+
+        if not _open_telegram_in_edge(banner):
+            banner.update("Failed to open Telegram. Close this banner and try again.")
+            banner.wait_for_next(timeout=15)
+            return False
+
+        banner.update("Please log in to Telegram, then click Next")
+        if not banner.wait_for_next():
+            return False
+
+        banner.update(
+            "Now search for @BotFather in Telegram and open the chat. "
+            "Click Next when you're there."
+        )
+        if not banner.wait_for_next():
+            return False
+
+        banner.update("How do you want to set up the bot?")
+        choice = banner.wait_for_choice("Fresh setup", "Token already generated")
+
+        # Only the "Fresh setup" choice gets the extra BotFather
+        # instructions; any other result goes straight to the token prompt.
+        if choice == "left":
+            banner.update(
+                "In @BotFather, send these one at a time: /newbot → AutoUse → "
+                "a unique bot name. BotFather will reply with your token. "
+                "Click Next when you have it."
+            )
+            if not banner.wait_for_next():
+                return False
+
+        banner.update("Paste your BotFather token below and click Save.")
+        token = banner.wait_for_input(save_label="Save")
+        if not token:
+            return False  # banner never appeared or user closed it
+
+        _set_key_in_file(_API_KEY_FILE, "TELEGRAM_BOT_TOKEN", token.strip())
+
+        banner.update("Saved. Restarting AutoUse to start the bot…")
+        # Give the message time to stream out + a beat for the user to read
+        # it, then hard-exit the whole process. The user's next `python
+        # app.py` boot picks up the fresh TELEGRAM_BOT_TOKEN and the bot
+        # comes online with the saved owner chat. os._exit skips atexit /
+        # finally cleanup, which is what we want — the tk loop will be torn
+        # down as the process dies.
+        time.sleep(3)
+        banner.close()
+        os._exit(0)
+    finally:
+        # Release the guard even if closing the banner itself raises.
+        try:
+            if banner is not None:
+                banner.close()
+        finally:
+            with _SETUP_LOCK:
+                _SETUP_ACTIVE = False
diff --git a/Auto_Use/windows_use/remote_connection/telegram/view.py b/Auto_Use/windows_use/remote_connection/telegram/view.py
index a21c13f..8f4e0c2 100644
--- a/Auto_Use/windows_use/remote_connection/telegram/view.py
+++ b/Auto_Use/windows_use/remote_connection/telegram/view.py
@@ -17,140 +17,141 @@
# A small attribution goes a long way toward a healthy open-source
# community — thank you for contributing.
-import threading
+"""Flask Blueprint for the Windows Telegram surface.
+
+Mirror of the macOS view.py, adapted so app.py's single
+`from ...view import telegram_bp, start_bot` works on Windows. Routes:
+
+ GET /api/telegram/status → {connected, bot_username?}
+ POST /api/telegram/connect → kicks off the guided walkthrough (Edge)
+ POST /api/telegram/disconnect → clears the persisted token
+
+All token lookups read ONLY from api_key.txt. .env is intentionally not
+consulted — the bot treats api_key.txt as its single source of truth.
+"""
+import json
import logging
-import socket
-from pathlib import Path
-from flask import Blueprint, jsonify, request, send_file
+import threading
+import urllib.request
-logger = logging.getLogger(__name__)
+from flask import Blueprint, jsonify
-telegram_bp = Blueprint('telegram', __name__)
+# Re-export start_bot so app.py's
+# from Auto_Use.windows_use.remote_connection.telegram.view import telegram_bp, start_bot
+# works from a single import line, matching app.py:921.
+# _API_KEY_FILE comes from service.py too, which resolves it in a compiled-
+# build-aware way (next to the executable when frozen) — one source of truth.
+from .service import start_bot, _API_KEY_FILE # noqa: F401
+
+logger = logging.getLogger(__name__)
-_bot_instance = None
-_bot_thread = None
-_bot_username_cache = None
+telegram_bp = Blueprint("telegram_windows", __name__)
-# view.py -> telegram -> remote_connection -> windows_use -> Auto_Use / api_key / api_key.txt
-API_KEY_FILE = Path(__file__).parent.parent.parent.parent / "api_key" / "api_key.txt"
-PAIR_HTML = Path(__file__).parent / "pair.html"
+_bot_username_cache: str | None = None
-def _get_local_ip():
+def _read_token() -> str | None:
+ """Pull TELEGRAM_BOT_TOKEN out of api_key.txt. Returns None if the file,
+ the TELEGRAM_BOT_TOKEN= line or its value is missing, or if the file
+ cannot be read (a warning is logged). Only the first matching line is
+ considered. Does NOT consult .env or env vars on purpose."""
+ if not _API_KEY_FILE.exists():
+ return None
try:
- s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
- s.connect(("8.8.8.8", 80))
- ip = s.getsockname()[0]
- s.close()
- return ip
+ with open(_API_KEY_FILE, "r", encoding="utf-8") as f:
+ for line in f:
+ stripped = line.strip()
+ if stripped.startswith("TELEGRAM_BOT_TOKEN="):
+ # Everything after the first "=" is the value; empty → None.
+ val = stripped.partition("=")[2].strip()
+ return val or None
except Exception:
- return "127.0.0.1"
-
-
-def _read_telegram_token():
- if API_KEY_FILE.exists():
- try:
- with open(API_KEY_FILE, 'r', encoding='utf-8') as f:
- for line in f:
- if line.strip().startswith('TELEGRAM_BOT_TOKEN='):
- _, _, value = line.partition('=')
- return value.strip() or None
- except Exception:
- pass
+ logger.warning("could not read %s", _API_KEY_FILE)
return None
-def _save_telegram_token(token):
- API_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
+def _set_token(value: str) -> None:
+ """Write/clear TELEGRAM_BOT_TOKEN= in api_key.txt, preserving every other
+ line (incl. empty-value placeholders the AutoUse UI relies on).
+
+ Every existing TELEGRAM_BOT_TOKEN= line is rewritten with `value`; if
+ there is none, one is appended. Read and write failures are logged and
+ swallowed — the function never raises for them — and after a failed
+ read nothing is written."""
lines = []
found = False
- if API_KEY_FILE.exists():
- with open(API_KEY_FILE, 'r', encoding='utf-8') as f:
- for line in f:
- if line.strip().startswith('TELEGRAM_BOT_TOKEN='):
- lines.append(f'TELEGRAM_BOT_TOKEN={token}\n')
- found = True
- else:
- lines.append(line)
+ if _API_KEY_FILE.exists():
+ try:
+ with open(_API_KEY_FILE, "r", encoding="utf-8") as f:
+ for raw in f:
+ if raw.strip().startswith("TELEGRAM_BOT_TOKEN="):
+ lines.append(f"TELEGRAM_BOT_TOKEN={value}\n")
+ found = True
+ else:
+ # Guarantee a trailing newline so an appended token line
+ # cannot be glued onto an unterminated last line.
+ lines.append(raw if raw.endswith("\n") else raw + "\n")
+ except Exception:
+ logger.warning("could not read %s while updating token", _API_KEY_FILE)
+ # Bail out before the write below, so the file is left untouched.
+ return
if not found:
- lines.append(f'TELEGRAM_BOT_TOKEN={token}\n')
- with open(API_KEY_FILE, 'w', encoding='utf-8') as f:
- f.writelines(lines)
+ lines.append(f"TELEGRAM_BOT_TOKEN={value}\n")
+ try:
+ _API_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
+ with open(_API_KEY_FILE, "w", encoding="utf-8") as f:
+ f.writelines(lines)
+ except Exception:
+ logger.warning("could not write %s", _API_KEY_FILE)
-def _fetch_bot_username(token):
+def _fetch_bot_username(token: str) -> str | None:
+ """One-shot call to Telegram's getMe — used by /status so the panel can
+ show '@your_bot' instead of just 'connected'.
+
+ Returns the username, or None when the request fails or times out (5 s),
+ the reply is not "ok", or the username is empty. Errors are swallowed
+ without logging."""
try:
- import urllib.request, json
- resp = urllib.request.urlopen(f'https://api.telegram.org/bot{token}/getMe', timeout=5)
+ resp = urllib.request.urlopen(
+ f"https://api.telegram.org/bot{token}/getMe", timeout=5
+ )
+ # NOTE(review): resp is never closed explicitly — consider a `with`
+ # block around the request so the connection is released promptly.
data = json.loads(resp.read())
- if data.get('ok'):
- return data['result'].get('username', '')
+ if data.get("ok"):
+ return data["result"].get("username", "") or None
except Exception:
pass
return None
-def start_bot():
- global _bot_instance, _bot_thread, _bot_username_cache
- if _bot_thread and _bot_thread.is_alive():
- return
- token = _read_telegram_token()
- if not token:
- return
- _bot_username_cache = _fetch_bot_username(token)
- from .service import TelegramAgentBot
- _bot_instance = TelegramAgentBot(token)
- _bot_thread = threading.Thread(target=_bot_instance.run, daemon=True)
- _bot_thread.start()
- logger.info("Telegram bot started (@%s)", _bot_username_cache)
-
-
-def stop_bot():
- global _bot_instance, _bot_thread, _bot_username_cache
- if _bot_instance:
- _bot_instance.stop()
- _bot_instance = None
- _bot_thread = None
- _bot_username_cache = None
-
-
-@telegram_bp.route('/pair')
-def pair_page():
- return send_file(PAIR_HTML)
-
+# ── routes ──────────────────────────────────────────────────────────────────
-@telegram_bp.route('/api/telegram/save-token', methods=['POST'])
-def save_token():
- data = request.get_json()
- token = (data.get('token') or '').strip()
- if not token:
- return jsonify({'error': 'No token provided'}), 400
-
- username = _fetch_bot_username(token)
- if not username:
- return jsonify({'error': 'Invalid token — check and try again'}), 400
-
- _save_telegram_token(token)
- stop_bot()
- start_bot()
- return jsonify({'status': 'connected', 'bot_username': username})
-
-
-@telegram_bp.route('/api/telegram/status')
+@telegram_bp.route("/api/telegram/status", methods=["GET"])
def telegram_status():
- token = _read_telegram_token()
+ """Frontend uses this to decide which Remote Connection panel state to
+ show. If a token is present in api_key.txt → 'connected', and the panel
+ flips to the @bot_username + Disconnect view (Connect button is hidden).
+ Cached so we don't hit Telegram's API on every page load.
+
+ Responds {"connected": False} when no token is stored, otherwise
+ {"connected": True, "bot_username": <cached name, "" if unknown>}."""
+ global _bot_username_cache
+ token = _read_token()
if not token:
- return jsonify({'connected': False, 'local_ip': _get_local_ip()})
+ _bot_username_cache = None
+ return jsonify({"connected": False})
+ # A failed lookup is stored as "" (not None), so it is NOT retried on
+ # later requests until the cache is reset — by the no-token branch above
+ # or by telegram_disconnect.
+ if _bot_username_cache is None:
+ _bot_username_cache = _fetch_bot_username(token) or ""
return jsonify({
- 'connected': True,
- 'bot_username': _bot_username_cache,
- 'running': _bot_thread is not None and _bot_thread.is_alive(),
- 'local_ip': _get_local_ip()
+ "connected": True,
+ "bot_username": _bot_username_cache,
})
-@telegram_bp.route('/api/telegram/disconnect', methods=['POST'])
-def disconnect():
- stop_bot()
- _save_telegram_token('')
- return jsonify({'status': 'disconnected'})
\ No newline at end of file
+@telegram_bp.route("/api/telegram/connect", methods=["POST"])
+def telegram_connect():
+    """Start the guided Telegram walkthrough in the background.
+
+    The wizard (Edge → web.telegram.org → manual login, paced by the
+    floating banner) blocks on user clicks, so it runs on a daemon thread
+    and this handler answers immediately with {"status": "started"}. If
+    importing the setup module or starting the thread raises, the error is
+    logged and a 500 {"status": "error", "message": ...} is returned.
+    """
+    try:
+        from Auto_Use.windows_use.remote_connection.telegram.setup import (
+            run as wizard_entry,
+        )
+        worker = threading.Thread(target=wizard_entry, daemon=True)
+        worker.start()
+        return jsonify({"status": "started"})
+    except Exception as exc:
+        logger.exception("telegram_connect failed")
+        return jsonify({"status": "error", "message": str(exc)}), 500
+
+
+@telegram_bp.route("/api/telegram/disconnect", methods=["POST"])
+def telegram_disconnect():
+ """Clear the persisted token + the cached @bot_username. The polling
+ thread already running keeps polling until the next app restart (soft
+ disconnect) — clean shutdown of the bot loop is a future enhancement.
+
+ Always answers {"status": "disconnected"}: _set_token logs and swallows
+ its own file errors, so a failed write is not reported to the caller."""
+ global _bot_username_cache
+ _set_token("")
+ _bot_username_cache = None
+ return jsonify({"status": "disconnected"})
diff --git a/README.md b/README.md
index 8e32b67..d97dd1a 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,22 @@
---
+