diff --git a/Auto_Use/macOS_use/remote_connection/telegram/banner.py b/Auto_Use/macOS_use/remote_connection/telegram/banner.py new file mode 100644 index 0000000..0d8e9b9 --- /dev/null +++ b/Auto_Use/macOS_use/remote_connection/telegram/banner.py @@ -0,0 +1,880 @@ +# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. + +"""Interactive walkthrough banner for setup.py. + +A small always-on-top pill at the top-right of the screen that contains: + - the animated stop-orb on the left, + - a status message in the middle (multi-line capable; pill grows downward), + - a clickable "Next" button on the right (only visible when the script is + waiting for the user — hidden during processing steps). + +setup.py calls show() once, then alternates update("…") + wait_for_next() +to pace the user. close() tears it down. The Next button is shown +automatically inside wait_for_next() and hidden as soon as it returns, so +callers don't have to manage visibility manually. + +The pill default height is the original 44px. 
When a long status message +wraps to multiple lines a ResizeObserver in JS posts the new body height +back to Python via a second WKScriptMessageHandler, and Python resizes the +NSWindow (top edge anchored, height grows downward). + +Everything runs inside the existing Python process. pywebview's main-thread +NSApplication run loop (started by webview.start() in app.py) is reused — +AppKit work is dispatched onto it via PyObjCTools.AppHelper.callAfter so the +Flask worker thread that runs setup.py never touches Cocoa directly. + +If Cocoa/PyObjC isn't importable for any reason the class becomes a no-op +so the automation still completes without a banner. +""" +import logging +import threading + +logger = logging.getLogger(__name__) + +try: + from Cocoa import ( + NSPanel, NSColor, NSScreen, + NSBackingStoreBuffered, NSMakeRect, + ) + from Foundation import NSObject + from WebKit import WKWebView, WKWebViewConfiguration + from PyObjCTools.AppHelper import callAfter + _COCOA_OK = True +except Exception as e: + logger.warning(f"banner: Cocoa unavailable, popup disabled ({e})") + _COCOA_OK = False + +# Non-activating panel: clicks inside the WebView do NOT activate the Python +# process, so the AutoUse main pywebview window can't pop over Safari while +# the wizard is running. The panel still becomes key when a text input needs +# keyboard focus: _NonActivatingPanel overrides canBecomeKeyWindow, and +# becomesKeyOnlyIfNeeded is left at its NSPanel default in _create() (this +# module never calls setBecomesKeyOnlyIfNeeded_). +NSWindowStyleMaskNonactivatingPanel = 1 << 7 # 128 +NSStatusWindowLevel = 25 + + +BANNER_HTML = """ + + + +
+
+
+
+
+
+
+
+Starting… + +
+ + +
+
+
+ + +
+
+
+ + +""" + + +# Compact HTML — used when StatusBanner(compact=True). Just the orb in a tiny +# circular pill, no message span, no Next button, no JS message handlers. The +# centred PC monitor icon cross-fades with a Telegram paper-plane every ~5s +# so the user can tell at a glance this is a Telegram-triggered task. +COMPACT_HTML = """ + + + +
+
+
+
+
+
+
+
+
+ +
+
+
+ +""" + + +if _COCOA_OK: + class _NonActivatingPanel(NSPanel): + """Borderless NSPanel that can still become key. + + AppKit returns NO from -canBecomeKeyWindow for borderless panels by + default, which blocks WKWebView text inputs from ever receiving + keyboard focus (the user clicks the field and nothing happens). + Overriding to YES makes the field usable. NSWindowStyleMaskNonactivatingPanel + is still set on the instance, so becoming key still doesn't activate + this Python process — Safari stays in the foreground.""" + def canBecomeKeyWindow(self): + return True + + + class _ClickableWebView(WKWebView): + """WKWebView that returns YES from acceptsFirstMouse:. + + Without this, the first click after the panel loses key status + (e.g. user just clicked Safari) is swallowed by AppKit while it + promotes the panel back to key — the button click never fires, and + the user has to tap a second time. Returning YES tells AppKit to + forward the very first click straight to the view, so single-tap + works regardless of key-window state.""" + def acceptsFirstMouse_(self, event): + return True + + + class _NextHandler(NSObject): + """WKScriptMessageHandler — fires self._event when JS posts to 'next_clicked'. + + No custom init: PyObjC's bridged NSObject.init takes no args, so calling + NSObject.init(self) inside a subclass crashes with "Need 0 arguments, + got 1". Instead, allocate with the default init and set the event as a + plain Python attribute right after — PyObjC subclasses accept arbitrary + Python attributes just fine. 
+ """ + def userContentController_didReceiveScriptMessage_(self, controller, message): + try: + self._event.set() + except Exception: + pass + + class _HeightHandler(NSObject): + """WKScriptMessageHandler — receives body.scrollHeight from JS and calls + the banner's _on_height_changed on the main thread (already the current + thread, since WK message delivery is on main).""" + def userContentController_didReceiveScriptMessage_(self, controller, message): + try: + banner = self._banner + if banner is not None: + banner._on_height_changed(int(message.body())) + except Exception: + pass + + class _ChoiceHandler(NSObject): + """WKScriptMessageHandler for the two-button choice row. Stores the + clicked label ('left' or 'right') on self._value, then fires self._event.""" + def userContentController_didReceiveScriptMessage_(self, controller, message): + try: + self._value = str(message.body()) + self._event.set() + except Exception: + pass + + class _SaveHandler(NSObject): + """WKScriptMessageHandler for the token input. Stores the typed string + on self._value, then fires self._event.""" + def userContentController_didReceiveScriptMessage_(self, controller, message): + try: + self._value = str(message.body()) + self._event.set() + except Exception: + pass + + class _RevealHandler(NSObject): + """WKScriptMessageHandler fired by JS when the word-by-word setMsg + reveal finishes. Used to gate control-set visibility on stream + completion so buttons don't pop in mid-sentence.""" + def userContentController_didReceiveScriptMessage_(self, controller, message): + try: + self._event.set() + except Exception: + pass +else: + _NextHandler = None + _HeightHandler = None + _ChoiceHandler = None + _SaveHandler = None + _RevealHandler = None + + +class StatusBanner: + W, MIN_H, MAX_H, TOP_MARGIN, RIGHT_MARGIN = 440, 44, 200, 56, 20 + # Compact variant: just the orb, no msg / button / scripts. Fixed-size + # circular pill (W == H, radius == W/2). 
Used for "Telegram task running" + # indicator — pure visual, click-through. Sized to hug the 36 px orb with + # ~4 px breathing room — anything taller and the pill looks padded. + COMPACT_W = COMPACT_H = 44 + + def __init__(self, compact: bool = False): + self._compact = compact + self._window = None + self._webview = None + self._next_handler = None # strong refs so the JS-bridge handlers + self._height_handler = None # don't get GC'd + self._choice_handler = None + self._save_handler = None + self._reveal_handler = None + self._next_event = threading.Event() + self._choice_event = threading.Event() + self._save_event = threading.Event() + # Set initially: no streaming reveal is pending until update() is called. + # update() clears this; the JS reveal_done handler re-sets it. + self._reveal_event = threading.Event() + self._reveal_event.set() + self._current_h = self.COMPACT_H if compact else self.MIN_H + + # ---- public API (callable from any thread) ---- + + def show(self): + if not _COCOA_OK: + return + callAfter(self._create) + + def update(self, text): + # Compact pills have no msg span — silently no-op so callers don't + # have to branch. + if not _COCOA_OK or self._compact: + return + # A streaming reveal is about to start in JS; clear the event so any + # following wait_for_* call blocks until JS posts reveal_done. + self._reveal_event.clear() + callAfter(self._set_text, text) + + # Cap the wait-for-reveal so a JS hiccup that drops the reveal_done + # message can never deadlock us. Realistic banner messages stream out + # in well under this — and shorter is better, because the wait is what + # the user experiences between the message finishing and the button + # showing. + _REVEAL_WAIT_SEC = 3.0 + + def _await_reveal(self): + """Block until the most recent update()'s reveal animation has + finished (or the safety timeout fires). 
No-op if no update() is + pending — the event stays set in that case.""" + self._reveal_event.wait(self._REVEAL_WAIT_SEC) + + def wait_for_next(self, timeout=None): + """Block calling thread until user clicks Next (or timeout). Returns True if clicked. + + Also returns True without any click when Cocoa is unavailable, in + compact mode, or when the banner is closed mid-wait (_destroy() sets + the event to release waiters). Returns False only when `timeout` + elapses first. + + Shows the Next button on entry and hides it on exit, so during normal + update() calls the button stays hidden — only the entry/exit boundaries + of a wait_for_next show a clickable Next. + """ + if not _COCOA_OK: + return True # no banner → don't block forever + if self._compact: + # No Next button in compact mode — return immediately so callers + # that accidentally chain it don't hang forever. + return True + # Clear the click event BEFORE the reveal wait. If we cleared after, + # any click that lands during streaming (rare, since the button is + # hidden until reveal finishes — but defensive) would be wiped here + # and the user would have to click a second time. + self._next_event.clear() + self._await_reveal() + callAfter(self._clear_extra_ui) + callAfter(self._set_next_visible, True) + clicked = self._next_event.wait(timeout) + callAfter(self._set_next_visible, False) + return clicked + + def wait_for_choice(self, left_label, right_label, timeout=None): + """Show two side-by-side buttons; block until one is clicked. + Returns 'left' or 'right', or None on timeout / no Cocoa / compact mode. + The Next button is hidden before the choice row is shown, and the + choice row is cleared again before returning.""" + if not _COCOA_OK or self._compact: + return None + self._choice_event.clear() + self._await_reveal() + callAfter(self._set_next_visible, False) + callAfter(self._show_choice, left_label, right_label) + clicked = self._choice_event.wait(timeout) + value = getattr(self._choice_handler, "_value", None) if clicked else None + callAfter(self._clear_extra_ui) + return value + + def wait_for_input(self, save_label="Save", validate=None, + error_msg="Token can't be empty"): + """Show a text input + Save button; block until user submits a value + that passes `validate` (default: non-empty after strip). 
Failed + validation surfaces `error_msg` in red below the input and keeps + waiting. Returns the accepted value, or None on no Cocoa.""" + if not _COCOA_OK or self._compact: + return None + if validate is None: + validate = lambda v: bool((v or "").strip()) + self._save_event.clear() + self._await_reveal() + callAfter(self._set_next_visible, False) + callAfter(self._show_input, save_label) + try: + while True: + self._save_event.wait() + # _destroy() also sets the event — bail out if the banner + # has been torn down out from under us. + if self._webview is None: + return None + value = getattr(self._save_handler, "_value", "") or "" + if validate(value): + return value + callAfter(self._set_input_error, error_msg) + self._save_event.clear() + finally: + callAfter(self._clear_extra_ui) + + def close(self): + if not _COCOA_OK: + return + callAfter(self._destroy) + + # ---- main-thread implementations ---- + + def _create(self): + try: + scr = NSScreen.mainScreen().frame() + if self._compact: + w_px, h_px = self.COMPACT_W, self.COMPACT_H + corner = w_px / 2.0 + html = COMPACT_HTML + ignores_mouse = True # click-through; purely visual + else: + w_px, h_px = self.W, self.MIN_H + corner = self.MIN_H / 2.0 + html = BANNER_HTML + ignores_mouse = False + x = scr.size.width - w_px - self.RIGHT_MARGIN + y = scr.size.height - h_px - self.TOP_MARGIN + rect = NSMakeRect(x, y, w_px, h_px) + + w = _NonActivatingPanel.alloc().initWithContentRect_styleMask_backing_defer_( + rect, NSWindowStyleMaskNonactivatingPanel, + NSBackingStoreBuffered, False, + ) + w.setLevel_(NSStatusWindowLevel) + w.setOpaque_(False) + w.setBackgroundColor_(NSColor.clearColor()) + w.setIgnoresMouseEvents_(ignores_mouse) + w.setHasShadow_(True) + w.setReleasedWhenClosed_(False) + # Panels normally hide when their app deactivates — we want the + # banner to stay visible the entire time Safari is in front. 
+ # Leave becomesKeyOnlyIfNeeded at the NSPanel default (NO) so a + # click on the token input properly makes the panel key and the + # field accepts paste / typing. NonactivatingPanelMask means + # becoming key still doesn't activate the Python process. + try: + w.setHidesOnDeactivate_(False) + except Exception: + pass + + content = w.contentView() + content.setWantsLayer_(True) + content.layer().setBackgroundColor_( + NSColor.colorWithCalibratedRed_green_blue_alpha_(1.0, 1.0, 1.0, 0.96).CGColor() + ) + # Fixed at MIN_H/2 so the pill stays a stadium at default height + # and becomes a rounded-rectangle when the height grows to fit + # multi-line messages — cleaner than a fat oval. In compact mode + # we use W/2 → perfect circle. + content.layer().setCornerRadius_(corner) + content.layer().setMasksToBounds_(True) + + cfg = WKWebViewConfiguration.alloc().init() + + # JS→Python bridges only relevant in standard mode (compact pill + # has no Next button and a fixed size — no need for either handler). 
+ if not self._compact: + nh = _NextHandler.alloc().init() + nh._event = self._next_event + cfg.userContentController().addScriptMessageHandler_name_(nh, "next_clicked") + + hh = _HeightHandler.alloc().init() + hh._banner = self + cfg.userContentController().addScriptMessageHandler_name_(hh, "height_changed") + + ch = _ChoiceHandler.alloc().init() + ch._event = self._choice_event + ch._value = None + cfg.userContentController().addScriptMessageHandler_name_(ch, "choice_clicked") + + sh = _SaveHandler.alloc().init() + sh._event = self._save_event + sh._value = "" + cfg.userContentController().addScriptMessageHandler_name_(sh, "save_clicked") + + rh = _RevealHandler.alloc().init() + rh._event = self._reveal_event + cfg.userContentController().addScriptMessageHandler_name_(rh, "reveal_done") + else: + nh = hh = ch = sh = rh = None + + wv_rect = NSMakeRect(0, 0, w_px, h_px) + wv = _ClickableWebView.alloc().initWithFrame_configuration_(wv_rect, cfg) + try: + wv.setValue_forKey_(False, "drawsBackground") + except Exception: + pass + try: + wv.setWantsLayer_(True) + wv.layer().setBackgroundColor_(NSColor.clearColor().CGColor()) + except Exception: + pass + # NSViewWidthSizable (2) | NSViewHeightSizable (16). When the + # window animates between sizes (multi-line message growing, + # collapsing back to single line), the WebView's frame follows + # the animation instead of snapping — that's what makes the + # pill grow/shrink as a smooth shape. + try: + wv.setAutoresizingMask_(2 | 16) + except Exception: + pass + wv.loadHTMLString_baseURL_(html, None) + content.addSubview_(wv) + + w.orderFrontRegardless() + # Make the panel key on show so the first user click on Next + # registers as the button click — not as "promote panel to key". + # NonActivatingPanelMask means becoming key still doesn't + # activate this Python process, so Safari stays in front. 
+ if not self._compact: + try: + w.makeKeyWindow() + except Exception: + pass + self._window, self._webview = w, wv + self._next_handler, self._height_handler = nh, hh + self._choice_handler, self._save_handler = ch, sh + self._reveal_handler = rh + self._current_h = h_px + except Exception as e: + logger.warning(f"banner: _create failed ({e})") + + def _set_text(self, text): + try: + if self._webview is None: + return + safe = (str(text) + .replace("\\", "\\\\") + .replace("'", "\\'") + .replace("\n", " ") + .replace("\r", " ")) + # Primary path: hand the full text to JS which animates it + # word-by-word and fires reveal_done when finished. Fallback: + # if the page-side script hasn't run yet (window.setMsg is + # undefined — happens for the very first update right after + # the WebView starts loading), set textContent directly and + # post reveal_done ourselves so wait_for_next doesn't sit on + # its safety timeout. + js = (f"if (window.setMsg) {{ setMsg('{safe}'); }}" + f" else {{" + f" var m = document.getElementById('msg');" + f" if (m) m.textContent = '{safe}';" + f" try {{ webkit.messageHandlers.reveal_done.postMessage(1); }}" + f" catch (e) {{}}" + f" }}") + self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + def _set_next_visible(self, visible): + try: + if self._webview is None: + return + disp = "inline-block" if visible else "none" + js = (f"var b=document.getElementById('next'); " + f"if (b) b.style.display='{disp}';") + self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + @staticmethod + def _js_escape(text): + return (str(text) + .replace("\\", "\\\\") + .replace("'", "\\'") + .replace("\n", " ") + .replace("\r", " ")) + + def _show_choice(self, left_label, right_label): + try: + if self._webview is None: + return + l = self._js_escape(left_label) + r = self._js_escape(right_label) + js = f"if (window.setChoice) setChoice('{l}', '{r}');" + 
self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + def _show_input(self, save_label): + try: + if self._webview is None: + return + s = self._js_escape(save_label) + js = f"if (window.setInput) setInput('{s}');" + self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + def _set_input_error(self, msg): + try: + if self._webview is None: + return + m = self._js_escape(msg or "") + js = f"if (window.setInputError) setInputError('{m}');" + self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + def _clear_extra_ui(self): + try: + if self._webview is None: + return + js = "if (window.clearAll) clearAll();" + self._webview.evaluateJavaScript_completionHandler_(js, None) + except Exception: + pass + + def _on_height_changed(self, requested_h): + """Resize the NSWindow to match the WebView's content height. + + Top edge stays put — height grows downward by adjusting NSWindow's + bottom-left origin Y. Clamped to [MIN_H, MAX_H]. + """ + try: + if self._window is None: + return + new_h = max(self.MIN_H, min(int(requested_h), self.MAX_H)) + if abs(new_h - self._current_h) < 1: + return + self._current_h = new_h + frame = self._window.frame() + # NSWindow origin is bottom-left; to keep top edge fixed while + # height changes, shift origin Y by (old_h - new_h). + new_y = frame.origin.y + frame.size.height - new_h + new_frame = NSMakeRect(frame.origin.x, new_y, frame.size.width, new_h) + self._window.setFrame_display_animate_(new_frame, True, True) + # The WebView resizes with the window via its autoresizingMask + # (set in _create), so no manual setFrame snap is needed here — + # snapping would override the in-flight animation and the pill + # would visually jump to its final size rather than morph. 
+ except Exception as e: + logger.warning(f"banner: _on_height_changed failed ({e})") + + def _destroy(self): + try: + if self._webview is not None: + try: + self._webview.stopLoading() + except Exception: + pass + try: + cfg = self._webview.configuration() + if cfg is not None: + uc = cfg.userContentController() + uc.removeScriptMessageHandlerForName_("next_clicked") + uc.removeScriptMessageHandlerForName_("height_changed") + uc.removeScriptMessageHandlerForName_("choice_clicked") + uc.removeScriptMessageHandlerForName_("save_clicked") + uc.removeScriptMessageHandlerForName_("reveal_done") + except Exception: + pass + if self._window is not None: + self._window.orderOut_(None) + except Exception: + pass + finally: + for ev in (self._next_event, self._choice_event, + self._save_event, self._reveal_event): + try: + ev.set() + except Exception: + pass + self._window = None + self._webview = None + self._next_handler = None + self._height_handler = None + self._choice_handler = None + self._save_handler = None + self._reveal_handler = None diff --git a/Auto_Use/macOS_use/remote_connection/telegram/service.py b/Auto_Use/macOS_use/remote_connection/telegram/service.py new file mode 100644 index 0000000..d6e539b --- /dev/null +++ b/Auto_Use/macOS_use/remote_connection/telegram/service.py @@ -0,0 +1,847 @@ +# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. + +"""Telegram → AgentService bridge with a guided provider/model picker. + +Runs as a standalone process (not mounted into Flask). On the first message +the bot asks you to pick a provider (limited to providers with a non-empty +key in api_key.txt), then a model (from the same MODEL_MAPPINGS the +AutoUse frontend uses). Subsequent messages are dispatched as tasks to the +agent with that provider/model. Picked provider/model persist for the whole +chat session until you `/reset`. + +Token lookup: + TELEGRAM_BOT_TOKEN is read from Auto_Use/api_key/api_key.txt only. + Environment variables and .env are deliberately NOT consulted (see the + comment below the imports). + +Setup: + 1. @BotFather → /newbot → copy token. + 2. Paste it into api_key.txt as TELEGRAM_BOT_TOKEN=… + 3. Make sure at least one provider key (e.g. OPENROUTER_API_KEY=…) is set + in the same file. + 4. python -m Auto_Use.macOS_use.remote_connection.telegram.service + 5. On phone: open Telegram, find your bot, send any message. +""" +import asyncio +import datetime +import importlib +import logging +import sys +import threading +from pathlib import Path + +from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup +from telegram.ext import ( + Application, + CommandHandler, + MessageHandler, + CallbackQueryHandler, + filters, +) + +logger = logging.getLogger(__name__) + +# The Telegram surface treats api_key.txt as its single source of truth — we +# deliberately do NOT consult .env or env vars here. .env is app.py's general +# env-loading concern; keeping the bot self-contained against api_key.txt +# avoids two-files-of-record confusion. 
+# +# Resolve api_key.txt the same way app.py's get_auto_use_path() does: in a +# compiled/frozen build __file__ points INSIDE the bundle, so the parents[4] +# walk would miss the editable api_key.txt that lives next to the executable +# (the one the Settings panel and the regular agent use). Fall back to the +# source-tree path in dev (python app.py). +_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals() +if _IS_COMPILED: + _API_KEY_FILE = Path(sys.executable).parent / "Auto_Use" / "api_key" / "api_key.txt" +else: + # service.py → telegram → remote_connection → macOS_use → Auto_Use → repo root + _API_KEY_FILE = ( + Path(__file__).resolve().parents[4] / "Auto_Use" / "api_key" / "api_key.txt" + ) + +# Agent writes per-step "milestone" lines here. We tail this file during a +# task and forward each new line back to the user's Telegram chat so they +# see the agent's progress in real time. +SCRATCHPAD_PATH = ( + Path(__file__).resolve().parents[2] / "scratchpad" / "milestone" / "milestone.md" +) +SCRATCHPAD_POLL_SEC = 2.0 +MAX_TG_MSG_LEN = 4000 # Telegram caps at 4096; leave headroom for safety + +# Provider id → API-key name in the KV files. Same mapping the Windows side +# uses ([windows_use/remote_connection/telegram/service.py:44-51]). +PROVIDER_KEY_MAP = { + "openrouter": "OPENROUTER_API_KEY", + "groq": "GROQ_API_KEY", + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "google": "GOOGLE_API_KEY", + "perplexity": "PERPLEXITY_API_KEY", +} + + +# ── file helpers ───────────────────────────────────────────────────────────── + +def _read_all_keys(path: Path) -> dict: + """Parse a simple KEY=VALUE file (one per line) into a dict. 
Skips empty + values and lines starting with '#'.""" + out = {} + if not path.exists(): + return out + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + k, v = k.strip(), v.strip() + if v: + out[k] = v + except Exception: + pass + return out + + +def _resolve_token() -> str | None: + """Read TELEGRAM_BOT_TOKEN from api_key.txt only. .env and env vars are + intentionally ignored — see header comment.""" + return _read_all_keys(_API_KEY_FILE).get("TELEGRAM_BOT_TOKEN") + + +def _get_available_providers() -> list: + """Providers with a non-empty key in api_key.txt only.""" + keys = _read_all_keys(_API_KEY_FILE) + return [ + {"id": pid, "key": keys[kname]} + for pid, kname in PROVIDER_KEY_MAP.items() + if keys.get(kname) + ] + + +def _set_key_in_file(path: Path, key: str, value: str) -> None: + """Write/update KEY=value in a KV file, preserving every other line. + + Unlike a naive read-all-and-write-back-with-_read_all_keys, this keeps + empty-value placeholder lines (e.g. GROQ_API_KEY=) intact — the AutoUse + UI relies on those for its provider list rendering. + """ + lines = [] + found = False + if path.exists(): + try: + with open(path, "r", encoding="utf-8") as f: + for raw in f: + stripped = raw.strip() + if stripped.startswith(f"{key}="): + lines.append(f"{key}={value}\n") + found = True + else: + lines.append(raw if raw.endswith("\n") else raw + "\n") + except Exception: + logger.warning("failed to read %s while updating %s", path, key) + return + if not found: + lines.append(f"{key}={value}\n") + try: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.writelines(lines) + except Exception: + logger.warning("failed to write %s", path) + + +def _resolve_owner_chat_id() -> int | None: + """Owner chat_id = whoever last sent /start. 
Stored in api_key.txt as + TELEGRAM_OWNER_CHAT_ID=…, so it survives restarts.""" + val = _read_all_keys(_API_KEY_FILE).get("TELEGRAM_OWNER_CHAT_ID") + if not val: + return None + try: + return int(val) + except ValueError: + return None + + +def _save_owner_chat_id(chat_id: int) -> None: + """Persist the owner chat_id so we can message them on the next boot.""" + _set_key_in_file(_API_KEY_FILE, "TELEGRAM_OWNER_CHAT_ID", str(chat_id)) + + +def _get_models_for_provider(provider_id: str) -> list: + """Read MODEL_MAPPINGS from Auto_Use/macOS_use/llm_provider/{provider_id}/view.py + and return non-hidden entries as [{id, display_name}, …]. + + display_name falls back to the model id when the mapping has none. + Returns [] when the module cannot be imported or anything else fails.""" + try: + mod = importlib.import_module( + f"Auto_Use.macOS_use.llm_provider.{provider_id}.view" + ) + mappings = getattr(mod, "MODEL_MAPPINGS", {}) + return [ + {"id": mid, "display_name": info.get("display_name", mid)} + for mid, info in mappings.items() + if not info.get("hidden", False) + ] + except Exception: + return [] + + +# ── per-chat state ─────────────────────────────────────────────────────────── + +# chat_id → { +# "phase": "idle" | "pick_provider" | "pick_model" | "ready" | "running", +# "provider": str | None, +# "model": str | None, +# "model_display": str | None, +# "queue": list[str], # tasks waiting to run, FIFO +# "pending": dict[str, str], # pending_id → task awaiting Yes/No +# "pending_counter": int, # monotonic id source for pending +# } +# Only "phase" is guaranteed to exist: _state() creates new entries as +# {"phase": "idle"}, so read every other key with .get() / setdefault(). +_chat_state: dict = {} + +# Guards mutations that read+modify state across threads (queue drain races +# between _run_agent's finally and the callback handler tapping "Yes"). +_state_lock = threading.Lock() + + +def _state(chat_id: int) -> dict: + """Return the mutable state dict for chat_id, creating it as + {"phase": "idle"} on first use.""" + return _chat_state.setdefault(chat_id, {"phase": "idle"}) + + +def _maybe_run_next_queued(chat_id: int, bot, loop) -> None: + """If this chat is ready and has a queued task, pop the next one and + start it. 
Threadsafe — called from both _run_agent's finally (worker + thread) and the q+ callback (asyncio loop).""" + with _state_lock: + state = _chat_state.get(chat_id) + if not state: + return + if state.get("phase") != "ready": + return + queue = state.get("queue") or [] + if not queue: + return + provider = state.get("provider") + model = state.get("model") + if not provider or not model: + return + next_task = queue.pop(0) + display = state.get("model_display") or model + state["phase"] = "running" + + _send_chat( + bot, + chat_id, + f"📝 Running queued task: {next_task[:200]} ({provider} · {display})", + loop, + ) + threading.Thread( + target=_run_agent, + args=(next_task, provider, model, chat_id, bot, loop), + daemon=True, + name=f"telegram-agent-{chat_id}-queued", + ).start() + + +# ── Telegram handlers ──────────────────────────────────────────────────────── + +def _build_online_text(providers: list) -> str: + now_str = datetime.datetime.now().strftime("%H:%M:%S") + if providers: + provider_line = ", ".join(p["id"] for p in providers) + return f"🟢 AutoUse online at {now_str}\nProviders: {provider_line}" + return f"🟢 AutoUse online at {now_str}\nProviders: (none configured)" + + +async def _show_provider_picker(message): + providers = _get_available_providers() + # Always lead with the "AutoUse online" status line so the user gets the + # same greeting they'd see at app boot, even when they message the bot + # first instead of waiting for the unsolicited startup announcement. + await message.reply_text(_build_online_text(providers)) + if not providers: + await message.reply_text( + "⚠️ No provider API keys found. Add at least one (e.g. " + "OPENROUTER_API_KEY=…) to api_key.txt or .env and try again." 
+ ) + return False + buttons = [ + [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")] + for p in providers + ] + await message.reply_text( + "👋 Pick a provider:", reply_markup=InlineKeyboardMarkup(buttons) + ) + return True + + +async def _discover_owner_from_updates(bot) -> int | None: + """Peek at the latest pending update on Telegram's servers and use its + chat_id as the owner. Lets the bot self-bootstrap on the very first run + after the chat-saving code was deployed, without requiring the user to + /start again. Uses offset=-1, which Telegram supports as 'just the most + recent update'. + + NOTE(review): the Bot API getUpdates documentation says that with a + negative offset 'all previous updates will be forgotten', so older + pending updates may be dropped by this call rather than left for the + polling updater — confirm that is acceptable before relying on it. + + Returns None when get_updates fails or no returned update has a chat.""" + try: + updates = await bot.get_updates(offset=-1, limit=1, timeout=2) + except Exception: + logger.warning("owner discovery: get_updates failed", exc_info=True) + return None + for upd in updates: + chat = getattr(upd, "effective_chat", None) + if chat and chat.id: + return int(chat.id) + return None + + +async def _post_init(application) -> None: + """Fires once after the bot finishes initialising (before polling starts). + Used to message the saved owner: 'AutoUse online at …' + a fresh provider + picker — so the user doesn't have to send anything to get going.""" + owner_id = _resolve_owner_chat_id() + if not owner_id: + # Not saved yet — try to auto-discover from Telegram's pending updates. + # Works if the user has ever messaged the bot, even before the + # chat-saving code was deployed. Persist the result so we don't need + # to re-discover on every boot. + owner_id = await _discover_owner_from_updates(application.bot) + if owner_id: + try: + _save_owner_chat_id(owner_id) + logger.info( + "owner discovery: saved chat_id=%s from getUpdates", + owner_id, + ) + except Exception: + logger.warning("owner discovery: could not persist chat_id", exc_info=True) + if not owner_id: + # No owner anywhere — they've never interacted with the bot. 
async def start_cmd(update, ctx):
    """Handle /start: remember the chat and show the provider picker.

    The chat id is persisted on a best-effort basis so later boots can
    greet this chat; a failure to save is logged and never blocks the
    command. The chat is parked in ``pick_provider`` while the picker is
    sent and dropped back to ``idle`` when the picker reports failure.
    """
    owner = update.effective_chat.id
    # Remember this chat so future boots can auto-greet (Phase 10 startup
    # announcement). A file-write failure must not stop /start.
    try:
        _save_owner_chat_id(owner)
    except Exception:
        logger.warning("could not persist owner chat_id", exc_info=True)
    _chat_state[owner] = {"phase": "pick_provider"}
    if await _show_provider_picker(update.message):
        return
    # Picker could not be shown, so fall back to idle.
    _chat_state[owner] = {"phase": "idle"}
async def text_handler(update, ctx):
    """Route a plain-text (non-command) message according to the chat's phase.

    * ``idle`` / ``pick_provider``: (re)send the provider picker and fall
      back to ``idle`` when the picker reports failure.
    * ``pick_model``: remind the user to finish picking a model.
    * ``running``: park the text as a pending task under a fresh id and ask,
      via Yes/No inline buttons, whether to queue it.
    * anything else (``ready``): mark the chat ``running`` and hand the text
      to ``_run_agent`` on a daemon worker thread.

    Empty or whitespace-only messages are ignored in the last two cases.
    """
    chat_id = update.effective_chat.id
    # Persist on every message, not just /start, so the next app boot can
    # auto-announce "AutoUse online" without the user having to /start first.
    try:
        _save_owner_chat_id(chat_id)
    except Exception:
        logger.warning("could not persist owner chat_id", exc_info=True)
    state = _state(chat_id)
    phase = state.get("phase", "idle")

    if phase in ("idle", "pick_provider"):
        state["phase"] = "pick_provider"
        ok = await _show_provider_picker(update.message)
        if not ok:
            state["phase"] = "idle"
        return

    if phase == "pick_model":
        await update.message.reply_text(
            "Pick a model from the buttons above first."
        )
        return

    task = (update.message.text or "").strip()
    if not task:
        return

    if phase == "running":
        # Busy: offer to queue this task. Each pending prompt gets a unique
        # id so multiple "queue this?" prompts can coexist if the user spams.
        state.setdefault("pending", {})
        state["pending_counter"] = state.get("pending_counter", 0) + 1
        pending_id = str(state["pending_counter"])
        state["pending"][pending_id] = task
        buttons = [[
            InlineKeyboardButton("✅ Yes, queue it", callback_data=f"q+:{pending_id}"),
            InlineKeyboardButton("❌ No", callback_data=f"q-:{pending_id}"),
        ]]
        await update.message.reply_text(
            f"⏳ Currently busy performing a task.\n"
            f"Do you want to queue: \"{task[:200]}\" ?",
            reply_markup=InlineKeyboardMarkup(buttons),
        )
        return

    # phase == "ready"
    provider = state.get("provider")
    model = state.get("model")
    if not provider or not model:
        # A "ready" chat without a provider/model cannot run anything.
        # Restart the conversation instead of raising KeyError.
        state["phase"] = "idle"
        await update.message.reply_text(
            "Session expired. Send any message to start over."
        )
        return
    display = state.get("model_display") or model
    state["phase"] = "running"
    try:
        # Echo only a preview: the full task could push the message past
        # Telegram's length cap and make the send fail.
        await update.message.reply_text(
            f"📝 Running: {task[:200]} ({provider} · {display})"
        )
    except Exception:
        # The acknowledgement is best-effort. If it fails we must still start
        # the worker, otherwise the chat stays "running" with nothing running.
        logger.warning("could not acknowledge task start", exc_info=True)
    bot = ctx.bot
    loop = asyncio.get_running_loop()
    threading.Thread(
        target=_run_agent,
        args=(task, provider, model, chat_id, bot, loop),
        daemon=True,
        name=f"telegram-agent-{chat_id}",
    ).start()
callback_handler(update, ctx): + query = update.callback_query + await query.answer() + chat_id = query.message.chat_id + try: + _save_owner_chat_id(chat_id) + except Exception: + logger.warning("could not persist owner chat_id", exc_info=True) + state = _state(chat_id) + data = query.data or "" + + if data.startswith("provider:"): + provider_id = data.split(":", 1)[1] + state["provider"] = provider_id + state["phase"] = "pick_model" + models = _get_models_for_provider(provider_id) + if not models: + state["phase"] = "pick_provider" + await query.edit_message_text( + f"⚠️ No models found for {provider_id}. Pick another provider." + ) + return + buttons = [ + [InlineKeyboardButton(m["display_name"], callback_data=f"model:{m['id']}")] + for m in models + ] + await query.edit_message_text( + f"Pick a model for {provider_id}:", + reply_markup=InlineKeyboardMarkup(buttons), + ) + return + + if data.startswith("model:"): + model_id = data.split(":", 1)[1] + provider_id = state.get("provider") + if not provider_id: + state["phase"] = "idle" + await query.edit_message_text("Session expired. Send any message to start over.") + return + models = _get_models_for_provider(provider_id) + display = next( + (m["display_name"] for m in models if m["id"] == model_id), model_id + ) + state["model"] = model_id + state["model_display"] = display + state["phase"] = "ready" + await query.edit_message_text( + f"✅ Provider: {provider_id} / Model: {display}\n" + f"Send me a task whenever you're ready." + ) + return + + if data.startswith("q+:"): + # User wants to queue the pending task. + pending_id = data.split(":", 1)[1] + task = (state.get("pending") or {}).pop(pending_id, None) + if not task: + await query.edit_message_text("(That prompt has already been handled.)") + return + state.setdefault("queue", []).append(task) + qlen = len(state["queue"]) + await query.edit_message_text( + f"📥 Queued (position {qlen}): \"{task[:200]}\"\n" + f"Will run when the current task finishes." 
def _monitor_scratchpad(chat_id, bot, loop, stop_event, start_pos):
    """Tail SCRATCHPAD_PATH and forward each new non-empty line to the chat.

    Polls every SCRATCHPAD_POLL_SEC seconds. start_pos is the byte offset
    the file was at when the task began; only content written after it is
    forwarded, so milestones from earlier tasks are not replayed. Exits
    when stop_event is set, after one final sweep to flush any tail.

    The file is read in binary mode so offsets are real byte counts. While
    polling, a trailing line that is not yet newline-terminated is held
    back for one poll so a line caught mid-write is not sent as a fragment.
    If that tail has not grown by the next poll it is forwarded anyway.
    The final sweep always forwards whatever is left.
    """
    last_pos = start_pos  # byte offset of the first unread byte
    held_tail = 0  # size of the unterminated tail held back on the last poll

    def _read_and_forward(final=False):
        nonlocal last_pos, held_tail
        try:
            with open(SCRATCHPAD_PATH, "rb") as f:
                size = f.seek(0, 2)  # whence=2: jump to end to learn the size
                if size < last_pos:
                    # The file shrank (truncated or rotated). Restart from
                    # byte 0 so we do not seek past the end of fresh content.
                    last_pos = 0
                    held_tail = 0
                f.seek(last_pos)
                chunk = f.read()
        except FileNotFoundError:
            # File was deleted (e.g. the scratchpad directory got wiped).
            # Reset so the next poll reads the new file from the top.
            last_pos = 0
            held_tail = 0
            return
        except Exception as exc:
            logger.warning("Scratchpad read error: %s", exc)
            return
        if not chunk:
            return
        if not final:
            cut = chunk.rfind(b"\n") + 1
            tail = len(chunk) - cut
            if tail and tail != held_tail:
                # New or still-growing unterminated tail: hold it back.
                held_tail = tail
                chunk = chunk[:cut]
                if not chunk:
                    return
            else:
                # Fully terminated, or the tail stalled for a whole poll.
                held_tail = 0
        last_pos += len(chunk)
        text = chunk.decode("utf-8", errors="replace")
        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue
            # Chunk excessively long lines so we stay under Telegram's 4096 cap.
            for i in range(0, len(line), MAX_TG_MSG_LEN):
                _send_chat(bot, chat_id, line[i : i + MAX_TG_MSG_LEN], loop)

    while not stop_event.is_set():
        _read_and_forward()
        stop_event.wait(SCRATCHPAD_POLL_SEC)

    # Final sweep: catches anything written between the last poll and the
    # stop_event being set, including an unterminated last line.
    _read_and_forward(final=True)
Pops a compact pill so the + Mac user can see a Telegram task is running, and minimises the main app + window so the agent has the screen to itself. Restores phase to 'ready'.""" + # Compact "Telegram task in progress" indicator + minimise AutoUse window. + # Both are best-effort — never let UI fluff block the actual task. + from Auto_Use.macOS_use.remote_connection.telegram.banner import StatusBanner + task_banner = StatusBanner(compact=True) + try: + task_banner.show() + except Exception: + logger.warning("could not show task banner", exc_info=True) + # Minimise the AutoUse pywebview window so the agent has the screen to + # itself. We talk to pywebview directly via its global `windows` list + # rather than importing from app.py — `python app.py` makes app.py the + # __main__ module, so `from app import …` would re-import a *second* + # copy of app.py whose webview_window is still None, and the call would + # silently no-op. + try: + import webview as _webview + if _webview.windows: + _webview.windows[0].minimize() + except Exception: + logger.warning("could not minimise AutoUse window", exc_info=True) + + # Reset the milestone scratchpad to empty before starting the monitor. + # AgentService.__init__ wipes the entire scratchpad/ directory in + # _cleanup_scratchpad() — so if we snapshotted the file's current size + # here and the agent then deleted + rewrote it, the monitor's last_pos + # would point mid-way into the fresh content and we'd stream a + # fragment (e.g. "ome." instead of "Verified: …Chrome.") to the chat. + # Deleting the file ourselves up front and starting from byte 0 keeps + # the monitor aligned with whatever the agent writes next. Best-effort + # — a failure here just degrades us back to the old (buggy) behavior. 
+ try: + if SCRATCHPAD_PATH.exists(): + SCRATCHPAD_PATH.unlink() + except Exception: + logger.warning("could not reset milestone scratchpad", exc_info=True) + start_pos = 0 + stop_event = threading.Event() + monitor = threading.Thread( + target=_monitor_scratchpad, + args=(chat_id, bot, loop, stop_event, start_pos), + daemon=True, + name=f"telegram-scratchpad-{chat_id}", + ) + monitor.start() + + try: + # Imported lazily — pulls in tree/element → skimage etc., which we + # don't want to load until a task actually runs. + from Auto_Use.macOS_use.agent.service import AgentService + + # Look up the runtime API key for the chosen provider so LLMManager + # doesn't fall back to an os.getenv() the user never set. Telegram + # users edit api_key.txt (or the AutoUse Settings panel), not env + # vars — and the compiled build has no .env — so without passing + # api_key= here the agent dies with "X API key not provided and not + # found in .env file". _get_available_providers already gated the + # picker to non-empty keys, so this lookup returns a value. + provider_key_name = PROVIDER_KEY_MAP.get(provider) + provider_keys = _read_all_keys(_API_KEY_FILE) + provider_api_key = ( + provider_keys.get(provider_key_name) if provider_key_name else None + ) + + agent = AgentService( + provider=provider, + model=model, + save_conversation=False, + thinking=True, + api_key=provider_api_key, + ) + agent.process_request(task) + # Stop the monitor BEFORE the done message so the final scratchpad + # sweep happens first — keeps the chat in correct chronological order. + stop_event.set() + monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2) + # wait=True: block until "✅ Done." is on Telegram's servers before + # the finally-block fires _maybe_run_next_queued, which would + # otherwise schedule "📝 Running queued task: …" as a second, + # concurrent HTTP POST that can race past Done in delivery. 
def _build_telegram_app(token: str):
    """Create the python-telegram-bot Application with every handler attached.

    The application is built for *token* with ``_post_init`` registered
    through the builder's ``post_init`` hook. That hook sends the
    "AutoUse online" announcement and provider picker to the saved owner.
    Handlers are added in this order: the /start and /reset commands,
    the inline-button callback handler, then the plain-text handler for
    non-command messages.
    """
    builder = Application.builder().token(token).post_init(_post_init)
    app = builder.build()
    handlers = (
        CommandHandler("start", start_cmd),
        CommandHandler("reset", reset_cmd),
        CallbackQueryHandler(callback_handler),
        MessageHandler(filters.TEXT & ~filters.COMMAND, text_handler),
    )
    for handler in handlers:
        app.add_handler(handler)
    return app
def start_bot() -> None:
    """Launch the Telegram polling loop on a daemon thread.

    Calling it again while the previous thread is still alive is a no-op.
    Nothing is started when no bot token can be resolved. Every outcome is
    reported through ``_stderr`` so it shows up in the terminal.
    """
    global _BOT_THREAD
    existing = _BOT_THREAD
    if existing is not None and existing.is_alive():
        _stderr("start_bot() called but the bot is already running — skipping.")
        return

    token = _resolve_token()
    if not token:
        _stderr(
            "BOT NOT STARTED — TELEGRAM_BOT_TOKEN not found in env, .env, or "
            "api_key.txt. Paste your @BotFather token into one of those files."
        )
        return
    _stderr(f"starting bot (token ends in …{token[-6:]})")

    def _serve():
        # Thread body: give this thread its own event loop, run the bot on
        # it, and report any crash loudly instead of dying silently.
        import sys
        import traceback
        try:
            # A worker thread has no event loop of its own, so create one
            # and install it before anything asyncio-based runs.
            own_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(own_loop)
            tg_app = _build_telegram_app(token)
            try:
                own_loop.run_until_complete(_run_bot_until_stopped(tg_app))
            finally:
                own_loop.close()
        except Exception as e:
            _stderr(f"BOT CRASHED: {e!r}")
            traceback.print_exc(file=sys.stderr)

    worker = threading.Thread(target=_serve, daemon=True, name="telegram-bot")
    _BOT_THREAD = worker
    worker.start()
+# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. + +"""Telegram remote-connection setup driver (macOS, guided mode). + +Opens Safari, navigates to web.telegram.org, then lets the user log in +manually. Progress is paced by a small always-on-top banner that streams +status text and has a Next button. The script blocks on user clicks via +banner.wait_for_next() — the user does the actual login (phone, country, +OTP) themselves; we just get them to the right page. +""" +import logging +import os +import time + +from Auto_Use.macOS_use.controller.tool.open_app import open_app +from Auto_Use.macOS_use.tree.element import UIElementScanner, ELEMENT_CONFIG +from Auto_Use.macOS_use.controller.service import ControllerService +from Auto_Use.macOS_use.controller.key_combo.service import KeyComboService +from Auto_Use.macOS_use.remote_connection.telegram.banner import StatusBanner +from Auto_Use.macOS_use.remote_connection.telegram.service import ( + _API_KEY_FILE, _set_key_in_file, +) + +logger = logging.getLogger(__name__) + +TELEGRAM_WEB_URL = "web.telegram.org" +STEP_DELAY_SEC = 2 + + +def _find_address_bar(mapping: dict) -> str | None: + """Return the index of Safari's smart-search field, or None if not found.""" + for idx, info in mapping.items(): + if info.get("name") == "smart search field" and info.get("type") == "TextField": + return idx + return None + + +def _open_telegram_in_safari(banner) -> bool: + """Launch Safari and navigate it to web.telegram.org. + + Streams sub-step status to the banner so the user can see what's happening + while Safari takes focus. Returns False on any failure. 
def run(country_code: str = "", phone: str = "") -> bool:
    """Guided Telegram-Web pairing.

    Shows a banner and paces the user through the steps with Next clicks:
    open Telegram Web in Safari, log in manually, open the @BotFather
    chat, optionally create a new bot, then paste the bot token. A
    non-empty token is written to the key file as ``TELEGRAM_BOT_TOKEN``
    and the whole process is terminated with ``os._exit(0)`` so the next
    launch picks it up. On that success path the function never returns.

    country_code and phone are accepted but ignored. They are kept only so
    the pre-existing /api/telegram/connect callsite signature still works.

    Returns:
        False when Safari/Telegram could not be opened or when no usable
        token was entered. The banner is closed in both cases.
    """
    banner = StatusBanner()
    banner.show()
    try:
        banner.update("Let's get you set up with Telegram. Please click Next.")
        banner.wait_for_next()

        if not _open_telegram_in_safari(banner):
            banner.update("Failed to open Telegram. Close this banner and try again.")
            banner.wait_for_next(timeout=15)
            return False

        banner.update("Please log in to Telegram, then click Next")
        banner.wait_for_next()

        banner.update(
            "Now search for @BotFather in Telegram and open the chat. "
            "Click Next when you're there."
        )
        banner.wait_for_next()

        banner.update("How do you want to set up the bot?")
        choice = banner.wait_for_choice("Fresh setup", "Token already generated")

        if choice == "left":
            banner.update(
                "In @BotFather, send these one at a time: /newbot → AutoUse → "
                "a unique bot name. BotFather will reply with your token. "
                "Click Next when you have it."
            )
            banner.wait_for_next()

        banner.update("Paste your BotFather token below and click Save.")
        # Strip BEFORE the emptiness check: a whitespace-only entry must not
        # be saved as an empty token and followed by a hard exit.
        token = (banner.wait_for_input(save_label="Save") or "").strip()
        if not token:
            return False  # nothing usable entered (or Cocoa-unavailable fallback)

        _set_key_in_file(_API_KEY_FILE, "TELEGRAM_BOT_TOKEN", token)

        banner.update("Saved. Restarting AutoUse to start the bot…")
        # Give the message time to stream out and the user a moment to read
        # it, then hard-exit the whole process. os._exit skips atexit and
        # finally cleanup, so the banner is closed explicitly first.
        time.sleep(3)
        banner.close()
        os._exit(0)
    finally:
        banner.close()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. + +"""Flask Blueprint for the macOS Telegram surface. + +Lives in the telegram folder so all Telegram-related code stays here — app.py +just imports `telegram_bp` and calls `app.register_blueprint(...)`. Routes: + + GET /api/telegram/status → {connected, bot_username?} + POST /api/telegram/connect → kicks off the Phase 4 guided walkthrough + POST /api/telegram/disconnect → clears the persisted token + +All token lookups read ONLY from api_key.txt. We deliberately do NOT consult +.env — that file is app.py's general env-loading concern; the Telegram bot +treats api_key.txt as its single source of truth. +""" +import json +import logging +import threading +import urllib.request + +from flask import Blueprint, jsonify + +logger = logging.getLogger(__name__) + +telegram_bp = Blueprint("telegram_macos", __name__) + +# Single source of truth for the key-file path — service.py resolves it in a +# compiled-build-aware way (next to the executable when frozen). Importing it +# here keeps the picker/status/disconnect routes pointed at the same file the +# bot and the regular agent read. +from .service import _API_KEY_FILE + +_bot_username_cache: str | None = None + + +def _read_token() -> str | None: + """Pull TELEGRAM_BOT_TOKEN out of api_key.txt. Returns None if missing or + empty. 
def _fetch_bot_username(token: str) -> str | None:
    """Look up the bot's username with one call to Telegram's getMe.

    Used by /status so the panel can show '@your_bot' instead of just
    'connected'.

    Returns:
        The username from the response, or None when the request fails,
        the reply is not ``ok``, or the username is missing or empty.
    """
    try:
        # The context manager closes the HTTP response even if reading or
        # parsing raises, so the socket is never leaked.
        with urllib.request.urlopen(
            f"https://api.telegram.org/bot{token}/getMe", timeout=5
        ) as resp:
            data = json.loads(resp.read())
        if data.get("ok"):
            return data["result"].get("username", "") or None
    except Exception as exc:
        # Best-effort lookup. Log only the exception type so the token in
        # the request URL can never end up in the logs.
        logger.debug("getMe lookup failed: %s", type(exc).__name__)
    return None
@telegram_bp.route("/api/telegram/connect", methods=["POST"])
def telegram_connect():
    """Start the guided Telegram walkthrough in the background.

    The setup driver is imported lazily and run on a daemon thread because
    it blocks on user clicks, so this route answers immediately with
    ``{"status": "started"}``. If importing or starting the thread fails,
    the error is logged and reported as a 500 JSON response.
    """
    try:
        from Auto_Use.macOS_use.remote_connection.telegram.setup import (
            run as run_telegram_setup,
        )
        worker = threading.Thread(target=run_telegram_setup, daemon=True)
        worker.start()
    except Exception as e:
        logger.exception("telegram_connect failed")
        return jsonify({"status": "error", "message": str(e)}), 500
    return jsonify({"status": "started"})
The polling + thread already running keeps polling until the next app restart (soft + disconnect) — clean shutdown of the bot loop is a future enhancement.""" + global _bot_username_cache + _set_token("") + _bot_username_cache = None + return jsonify({"status": "disconnected"}) diff --git a/Auto_Use/macOS_use/tree/element.py b/Auto_Use/macOS_use/tree/element.py index 09c54c6..8af9a88 100644 --- a/Auto_Use/macOS_use/tree/element.py +++ b/Auto_Use/macOS_use/tree/element.py @@ -58,6 +58,7 @@ from ApplicationServices import ( AXUIElementCreateSystemWide, AXUIElementCreateApplication, AXUIElementCopyAttributeValue, AXUIElementSetAttributeValue, + AXUIElementGetPid, AXIsProcessTrusted, kAXErrorSuccess, ) @@ -530,21 +531,47 @@ def _point_in_rect(px, py, rect): and rect["y"] <= py <= rect["y"] + rect["height"]) -def _ancestor_clipped_visibility(frame, ancestors, screen, window_clip=None): +def _ancestor_clipped_visibility(frame, ancestors, screen, window_clip=None, + scroll_clip=None): """Bottom-up visibility check — mirrors Windows _get_clipping_ancestors. - Returns (visibility_str, visible_rect_dict_or_None).""" + Returns (visibility_str, visible_rect_dict_or_None). + + `ancestors` may be a list of frame dicts (legacy callers) or + `(frame, role)` tuples; tuple form lets us recognise scroll containers + and skip the fixed/sticky safety-net for them. + + `scroll_clip`, when provided, is the innermost scrollable container's + viewport rect. Elements outside it are scroll-clipped — strictly hidden, + no safety-net. 
+ """ visible = dict(frame) + if scroll_clip is not None: + inter = _rect_intersect(visible, scroll_clip) + if inter is None: + return "hidden", None + visible = inter + for anc in ancestors: if anc is None: continue - if anc["width"] < 50 or anc["height"] < 50: + if isinstance(anc, tuple): + anc_frame, anc_role = anc + if anc_frame is None: + continue + else: + anc_frame, anc_role = anc, None + if anc_frame["width"] < 50 or anc_frame["height"] < 50: continue - inter = _rect_intersect(visible, anc) + inter = _rect_intersect(visible, anc_frame) if inter is None: - anc_on_screen = _rect_intersect(anc, screen) is not None - anc_large = anc["width"] >= 100 and anc["height"] >= 100 + # Scroll containers are authoritative — if the element's frame is + # outside the viewport, it really is scrolled out. Don't bypass. + if anc_role in CLIP_ROLES: + return "hidden", None + anc_on_screen = _rect_intersect(anc_frame, screen) is not None + anc_large = anc_frame["width"] >= 100 and anc_frame["height"] >= 100 if anc_on_screen and anc_large: # Safety net for CSS position:fixed / sticky elements — # their AX parent frames may not encompass them even though @@ -614,9 +641,17 @@ def walk(element, results, depth, screen, clip=None, parent_frame=None, if frame and frame["width"] > 0 and frame["height"] > 0: label = build_label(element, cfg) if label: - vis_str, vis_rect = _ancestor_clipped_visibility(frame, ancestors, screen, window_clip) + vis_str, vis_rect = _ancestor_clipped_visibility( + frame, ancestors, screen, window_clip, + scroll_clip=clip) if vis_str != "hidden": + try: + err, elem_pid = AXUIElementGetPid(element, None) + if err != kAXErrorSuccess: + elem_pid = 0 + except Exception: + elem_pid = 0 results.append({ "type": role_str, "label": label, @@ -628,9 +663,12 @@ def walk(element, results, depth, screen, clip=None, parent_frame=None, "visibility": vis_str, "visible_rect_raw": vis_rect, "ax_element": element, + "_window_frame": window_clip, + "_pid": elem_pid, }) - 
def _build_full_occluder_stack(screen):
    """Return every on-screen window as an occluder candidate list.

    Issues a single ``CGWindowListCopyWindowInfo`` query (on-screen windows
    only, desktop elements excluded) and keeps the windows that could
    plausibly hide something.

    Args:
        screen: Screen rect, passed unchanged to ``_rect_overlaps``.

    Returns:
        A list of ``{pid, name, frame, window_id, layer}`` dicts in the
        order the query returned them, which the callers treat as
        front-to-back. ``frame`` is an ``{x, y, width, height}`` dict.
        Windows owned by "Window Server" or "Dock", windows without bounds,
        windows narrower or shorter than 50 units and windows that do not
        overlap ``screen`` are left out. Empty when the query yields nothing.
    """
    options = kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements
    windows = CGWindowListCopyWindowInfo(options, kCGNullWindowID)
    if not windows:
        return []

    skip_owners = {"Window Server", "Dock"}
    stack = []
    for info in windows:
        owner = info.get("kCGWindowOwnerName", "")
        if owner in skip_owners:
            continue
        bounds = info.get("kCGWindowBounds")
        if not bounds:
            continue
        width = bounds.get("Width", 0)
        height = bounds.get("Height", 0)
        if width < 50 or height < 50:
            continue
        left = bounds.get("X", 0)
        top = bounds.get("Y", 0)
        if not _rect_overlaps(left, top, width, height, screen):
            continue
        stack.append({
            "pid": info.get("kCGWindowOwnerPID", 0),
            "name": owner,
            "frame": {"x": left, "y": top, "width": width, "height": height},
            "window_id": info.get("kCGWindowNumber", 0),
            "layer": info.get("kCGWindowLayer", 0),
        })
    return stack


def _apply_window_occlusion(results, screen):
    """Recompute element visibility against the real window z-order.

    For every element whose owning window can be located in the occluder
    stack, the windows listed before it are treated as painting on top of
    it. The element's ``visibility`` is rewritten to ``"full"`` or
    ``"partial N%"``, and elements left with under 1% visible are dropped.

    Elements whose owning window is unknown (no ``_pid`` / ``_window_frame``
    or no stack entry within 20 units of that frame) are passed through
    untouched, as are elements that no higher window overlaps.

    Args:
        results: Element dicts produced by the tree walk. Entries are
            mutated in place when their visibility changes.
        screen: Screen rect used to build the occluder stack.

    Returns:
        The surviving elements, in their original order. ``results`` itself
        is returned when no window information is available.
    """
    full_stack = _build_full_occluder_stack(screen)
    if not full_stack:
        return results

    # (pid, x, y, width, height) -> index of the owning window in full_stack
    owning_cache = {}

    def _owning_index(pid, win_frame):
        """Index of the stack entry matching pid + frame, or -1."""
        if win_frame is None or not pid:
            return -1
        key = (pid, win_frame["x"], win_frame["y"],
               win_frame["width"], win_frame["height"])
        if key in owning_cache:
            return owning_cache[key]
        best = -1
        for i, w in enumerate(full_stack):
            if w["pid"] != pid:
                continue
            wf = w["frame"]
            # 20-unit tolerance: AX frames and CG bounds rarely agree exactly.
            if (abs(wf["x"] - win_frame["x"]) < 20
                    and abs(wf["y"] - win_frame["y"]) < 20
                    and abs(wf["width"] - win_frame["width"]) < 20
                    and abs(wf["height"] - win_frame["height"]) < 20):
                best = i
                break
        owning_cache[key] = best
        return best

    out = []
    for e in results:
        idx = _owning_index(e.get("_pid"), e.get("_window_frame"))
        if idx < 0:
            # Unknown owning window (menu-bar walk, dock) — leave as-is.
            out.append(e)
            continue

        elem_rect = {"x": e["x"], "y": e["y"],
                     "width": e["width"], "height": e["height"]}

        # Occlusion must be measured on the part of the element that survived
        # walk-time clipping. Sampling the whole frame and multiplying by the
        # clip ratio treats the two as independent, which keeps elements whose
        # only visible part is covered and double-penalises elements whose
        # already-clipped part is covered.
        # NOTE(review): assumes visible_rect_raw shares the coordinate space
        # of x/y/width/height, as the area ratio below already requires.
        vr = e.get("visible_rect_raw")
        target = elem_rect
        if vr:
            walk_frac = (vr["width"] * vr["height"]) / max(
                1, elem_rect["width"] * elem_rect["height"])
            if vr["width"] > 0 and vr["height"] > 0:
                target = vr
        else:
            walk_frac = 1.0

        owner_id = full_stack[idx].get("window_id")
        occluders = []
        for w in full_stack[:idx]:
            if w["window_id"] and w["window_id"] == owner_id:
                continue
            if _rect_intersect(target, w["frame"]) is not None:
                occluders.append(w["frame"])

        if not occluders:
            out.append(e)
            continue

        final = walk_frac * _visible_fraction_after_occluders(target, occluders)

        if final < 0.01:
            continue  # drop fully-occluded
        if final >= 0.99:
            e["visibility"] = "full"
        else:
            e["visibility"] = f"partial {int(final * 100)}%"
        out.append(e)

    return out


def _visible_fraction_after_occluders(rect, occluder_rects, samples=20):
    """Estimate the uncovered fraction of ``rect`` by grid sampling.

    The rect is probed at the cell centres of a ``samples`` x ``samples``
    grid (400 points by default). A point counts as covered when it lies
    inside, or on the edge of, any occluder.

    Args:
        rect: ``{x, y, width, height}`` dict to measure.
        occluder_rects: Rect dicts that paint on top of ``rect``.
        samples: Grid resolution per axis; values below 1 are treated as 1.

    Returns:
        A float in ``0.0..1.0``. ``0.0`` for a rect without positive area,
        ``1.0`` when there are no occluders.
    """
    if rect["width"] <= 0 or rect["height"] <= 0:
        return 0.0
    if not occluder_rects:
        return 1.0

    # A non-positive resolution used to divide by zero below.
    samples = max(1, int(samples))
    step_x = rect["width"] / samples
    step_y = rect["height"] / samples

    # Hoist the per-occluder edge arithmetic out of the sampling loops.
    edges = [(o["x"], o["x"] + o["width"], o["y"], o["y"] + o["height"])
             for o in occluder_rects]
    xs = [rect["x"] + (i + 0.5) * step_x for i in range(samples)]
    ys = [rect["y"] + (j + 0.5) * step_y for j in range(samples)]

    covered = sum(
        1
        for px in xs
        for py in ys
        if any(x0 <= px <= x1 and y0 <= py <= y1 for x0, x1, y0, y1 in edges)
    )
    total = samples * samples
    return (total - covered) / total
dialog_pids = set() skip_dialog_owners = {"Window Server", "Dock"} top_frame = top["frame"] @@ -928,8 +1098,11 @@ def extract_all(screen): if all_wins: for w in all_wins: wpid = w.get("kCGWindowOwnerPID", 0) + layer = w.get("kCGWindowLayer", -1) + if wpid == top["pid"] and layer == 0: + break # reached frontmost app's window; stop if wpid == top["pid"]: - continue # Skip topmost app's own windows + continue # frontmost app's own higher-layer windows already walked owner = w.get("kCGWindowOwnerName", "") if owner in skip_dialog_owners: continue @@ -955,10 +1128,6 @@ def extract_all(screen): if dwf and _on_screen(dwf, screen): walk(dwin, results, 0, screen, clip=dwf, window_clip=dwf) - allowed_pids = {top["pid"]} | dialog_pids - results = [e for e in results - if not _is_occluded(e, allowed_pids, window_stack)] - else: finder = find_app("com.apple.finder") if finder: @@ -1053,6 +1222,12 @@ def extract_all(screen): if dock: walk(AXUIElementCreateApplication(dock.processIdentifier()), results, 0, screen) + # ----- Real on-screen occlusion pass ----- + # Recompute each element's visibility against every window that paints + # on top of its owning window. Drop fully-covered elements so the agent + # never receives a click index for a coordinate it can't actually hit. + results = _apply_window_occlusion(results, screen) + # Deduplicate seen = set() unique = [] @@ -1063,6 +1238,11 @@ def extract_all(screen): unique.append(e) results = unique + # Strip internal helper keys before returning so they don't leak. 
+ for e in results: + e.pop("_window_frame", None) + e.pop("_pid", None) + return app_info, menu_items, results diff --git a/Auto_Use/windows_use/remote_connection/telegram/__init__.py b/Auto_Use/windows_use/remote_connection/telegram/__init__.py new file mode 100644 index 0000000..556670d --- /dev/null +++ b/Auto_Use/windows_use/remote_connection/telegram/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. diff --git a/Auto_Use/windows_use/remote_connection/telegram/banner.py b/Auto_Use/windows_use/remote_connection/telegram/banner.py new file mode 100644 index 0000000..37041a3 --- /dev/null +++ b/Auto_Use/windows_use/remote_connection/telegram/banner.py @@ -0,0 +1,1149 @@ +# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. + +"""Banner — both the StatusBanner wrapper used by callers AND the +subprocess that hosts the pywebview pill. + +The same module is invoked two ways: + + 1. **Imported** from setup.py / service.py — exposes the + `StatusBanner` class that drives the wizard. Side-effect-free: + pywebview is NOT imported at module load, only inside + `_run_subprocess_banner` which the parent never calls. + + 2. **Run as `python -m …banner`** (spawned by `StatusBanner.show()` + via `subprocess.Popen`) — falls through `if __name__ == "__main__"` + into `_run_subprocess_banner`, which boots pywebview and parks on + `webview.start()`. Reads JSON commands from stdin, emits JSON + events on stdout. + +Why two roles, one file? Running pywebview's second window from a +worker thread inside the already-running AutoUse process kept landing +the pill off-screen on DPI-scaled displays. A fresh Python interpreter +(the subprocess) was the only way to dodge that DPI confusion — +`banner_test.py` standalone works perfectly on the same machine. The +subprocess body used to live in a separate `banner_proc.py` but it +doesn't need to: a single module's `__main__` guard does the same job +with one fewer file to keep in sync. 
+ +Wire protocol (one JSON message per line): + + → stdin {"cmd": "MSG"|"SHOW_NEXT"|"HIDE_NEXT"|"SHOW_CHOICE"| + "SHOW_INPUT"|"CLEAR"|"CLOSE", ...} + ← stdout {"event": "READY"|"NEXT"|"CHOICE"|"SAVE"|"CLOSED", ...} +""" +import ctypes +import datetime +import json +import logging +import os +import subprocess +import sys +import tempfile +import threading +import time +import uuid +from queue import Queue, Empty + +logger = logging.getLogger(__name__) + +# True when this module is running inside the Nuitka-compiled AutoUse.exe +# (i.e. sys.executable is the exe, not a Python interpreter). In that case +# `python -m …banner` is meaningless — the binary has no -m loader — so +# StatusBanner.show() must re-exec AutoUse.exe with --banner-mode, which +# app.py's main() picks up and routes to _run_subprocess_banner() directly. +# Mirrors the detection in app.py:71 and the same pattern already used for +# --minion-mode in Auto_Use/windows_use/controller/view.py:697. +_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals() + + +# ── Pill geometry ───────────────────────────────────────────────────────── + +PILL_WIDTH = 580 +PILL_HEIGHT = 72 +# COMPACT_SIZE is the target square dimension for the small "telegram +# task running" indicator pill. WinForms imposes an OS-level minimum +# width (~SM_CXMINTRACK = 132+ logical pixels) on freshly created Forms, +# which stretches a smaller create_window request into a wide pill — so +# we always force the final size via window.resize() after the form is +# alive (see _on_shown). 80 is the sweet spot: small enough to read as +# an indicator, big enough to hold the 42 px orb with breathing room. 
COMPACT_SIZE = 80
SCREEN_MARGIN = 20


# ── Win32 region clip + click-through (subprocess-side, but ctypes is
#    stdlib so importing it at the top costs nothing for the parent) ──

class _RECT(ctypes.Structure):
    """ctypes layout of the Win32 RECT that GetWindowRect fills in."""

    _fields_ = [
        ("left", ctypes.c_long),
        ("top", ctypes.c_long),
        ("right", ctypes.c_long),
        ("bottom", ctypes.c_long),
    ]


def _stderr(msg: str) -> None:
    """Print ``[banner] msg`` to stderr and flush immediately.

    Used both by the parent (for `[banner] spawned subprocess pid=…` etc.)
    and by the subprocess, which inherits the parent's stderr so the
    messages land in the same terminal."""
    print(f"[banner] {msg}", file=sys.stderr, flush=True)


def _emit(event: str, **kwargs) -> None:
    """Subprocess → parent: write one JSON event line to stdout.

    The payload is ``{"event": event, **kwargs}``. Best effort by design:
    any serialisation or pipe error is swallowed so a vanished parent can
    never crash the GUI callbacks that call this."""
    try:
        payload = {"event": event, **kwargs}
        sys.stdout.write(json.dumps(payload) + "\n")
        sys.stdout.flush()
    except Exception:
        pass


# File-based event log for the subprocess. Lives at
# %LOCALAPPDATA%\AutoUse\banner_debug.log so it survives whatever happens
# to the subprocess's stdio. Best effort: any failure to write is swallowed.
def _log(msg: str) -> None:
    """Append a timestamped, pid-tagged line to banner_debug.log.

    Falls back to the user's home directory when LOCALAPPDATA is unset."""
    try:
        base = os.environ.get("LOCALAPPDATA") or os.path.expanduser("~")
        path = os.path.join(base, "AutoUse", "banner_debug.log")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "a", encoding="utf-8") as f:
            f.write(
                f"[{datetime.datetime.now().isoformat()}] pid={os.getpid()} {msg}\n"
            )
    except Exception:
        pass


# Single-pass translation table for _js_escape. U+2028 / U+2029 are line
# terminators inside string literals on pre-ES2019 engines, so leaving them
# raw would break the generated `evaluate_js` snippet there.
_JS_ESCAPES = str.maketrans({
    "\\": "\\\\",
    "'": "\\'",
    "\n": "\\n",
    "\r": None,
    "\u2028": "\\u2028",
    "\u2029": "\\u2029",
})


def _js_escape(text: str) -> str:
    """Escape ``text`` for embedding inside a single-quoted JS string.

    Backslashes and single quotes are backslash-escaped, newlines become
    ``\\n``, carriage returns are dropped, and the Unicode line/paragraph
    separators become ``\\u2028`` / ``\\u2029``. ``None`` or an empty value
    yields ``""``."""
    return (text or "").translate(_JS_ESCAPES)


def _find_hwnd(title: str, attempts: int = 40, delay: float = 0.025) -> int:
    """Locate the OS HWND for our pywebview window by title.

    Polls ``FindWindowW`` up to ``attempts`` times, sleeping ``delay``
    seconds between tries (about one second in total by default), because
    events.shown can fire one frame before the OS lets FindWindowW see the
    new window. Returns 0 when the window never appears."""
    user32 = ctypes.windll.user32
    for _ in range(attempts):
        hwnd = user32.FindWindowW(None, title)
        if hwnd:
            return hwnd
        time.sleep(delay)
    return 0


def _make_click_through(title: str) -> None:
    """Make the window pass mouse clicks to whatever is underneath it.

    Adds WS_EX_LAYERED | WS_EX_TRANSPARENT to the extended window style and
    then sets the layered alpha to 255, so the window stays fully visible
    but stops receiving mouse input. Used by the compact "telegram task in
    progress" indicator pill so it never blocks clicks on the desktop or
    other apps beneath it. Does nothing when the window cannot be found."""
    user32 = ctypes.windll.user32
    hwnd = _find_hwnd(title)
    if not hwnd:
        return
    GWL_EXSTYLE = -20
    WS_EX_LAYERED = 0x00080000
    WS_EX_TRANSPARENT = 0x00000020
    LWA_ALPHA = 0x00000002
    style = user32.GetWindowLongW(hwnd, GWL_EXSTYLE)
    user32.SetWindowLongW(
        hwnd, GWL_EXSTYLE, style | WS_EX_LAYERED | WS_EX_TRANSPARENT
    )
    # WS_EX_LAYERED windows render nothing until SetLayeredWindowAttributes
    # (or UpdateLayeredWindow) is called. alpha=255 → fully opaque so the
    # orb still paints normally; only mouse input is what we want to drop.
    user32.SetLayeredWindowAttributes(hwnd, 0, 255, LWA_ALPHA)


def _apply_rounded_region(title: str) -> None:
    """Clip the window with the given title into a stadium pill.

    Finds the HWND via ``_find_hwnd``, reads its current size with
    GetWindowRect, and installs a round-rect region whose corner ellipse is
    ``height`` x ``height`` so the end caps span the full height. Does
    nothing when the window cannot be found or has no area."""
    user32 = ctypes.windll.user32
    gdi32 = ctypes.windll.gdi32

    hwnd = _find_hwnd(title)
    if not hwnd:
        return

    rect = _RECT()
    if not user32.GetWindowRect(hwnd, ctypes.byref(rect)):
        return
    w = rect.right - rect.left
    h = rect.bottom - rect.top
    if w <= 0 or h <= 0:
        return

    # Pill: full-height end caps via corner ellipse = h × h.
    rgn = gdi32.CreateRoundRectRgn(0, 0, w + 1, h + 1, h, h)
    if not rgn:
        return
    # The system takes ownership of the region only when SetWindowRgn
    # succeeds; on failure it is still ours and must be freed.
    if not user32.SetWindowRgn(hwnd, rgn, True):
        gdi32.DeleteObject(rgn)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ + +""" + + +# ── JS↔Python bridge (subprocess-side only) ────────────────────────────── + + +# Hard cap on pill height so a freakishly long message can't push it +# into a wall-of-text rectangle. Matches the macOS banner's MAX_H. +_MAX_PILL_HEIGHT = 200 + + +class _BannerState: + """Mutable state shared between the resize-handler and the rest of + the subprocess body. + + Deliberately NOT used as `js_api` — see _make_js_handlers.""" + + def __init__(self, title: str, width: int, min_h: int, compact: bool): + self.window = None + self.title = title + self.width = width + self.min_h = min_h + self.compact = compact + self.last_h = min_h + + +def _make_js_handlers(state: _BannerState): + """Return JS-exposed handlers as a 4-tuple of plain local functions. + + We register these via `window.expose(*funcs)` instead of the old + `js_api=_Api(...)` pattern because pywebview's util.py:get_functions + filters attributes via `inspect.ismethod(attr)` — which returns + False for bound methods of Nuitka-compiled classes. In the + compiled binary that silently drops every method on _Api, so the + JS-side `window.pywebview.api.next_clicked()` resolves to nothing + and clicks become no-ops. `window.expose()` stores functions + directly in `window._functions`, which the dispatcher checks + BEFORE falling back to js_api reflection.""" + + def next_clicked(_value=None): + _emit("NEXT") + return None + + def choice_clicked(value=None): + _emit("CHOICE", value=str(value) if value is not None else "left") + return None + + def save_clicked(value=None): + _emit("SAVE", value=value.strip() if isinstance(value, str) else "") + return None + + def height_changed(h=0): + """Resize the window to fit the reported body height, then + re-clip the (possibly taller) window into a stadium so the end + caps follow the new height. 
No-op for the compact pill which + has no scrollable content and a constant 80×80 size.""" + if state.compact or state.window is None: + return None + try: + target = max(state.min_h, min(_MAX_PILL_HEIGHT, int(h))) + if target == state.last_h: + return None + state.last_h = target + state.window.resize(state.width, target) + # SetWindowRgn's saved region is anchored to the OLD height, + # so without re-clipping the bottom of the now-taller window + # would render as a hard rectangle below the pill ends. + _apply_rounded_region(state.title) + except Exception: + pass + return None + + return next_clicked, choice_clicked, save_clicked, height_changed + + +# ── stdin reader thread (subprocess-side only) ─────────────────────────── + + +def _stdin_reader(window) -> None: + """Loop reading JSON commands from stdin and dispatching to the window. + + Runs on its own thread so we don't block the pywebview GUI thread.""" + _log("stdin_reader: thread started") + try: + for line in sys.stdin: + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + except Exception: + _log(f"stdin_reader: skip unparseable line {line!r}") + continue + cmd = msg.get("cmd") + _log(f"stdin_reader: cmd={cmd!r}") + try: + if cmd == "MSG": + esc = _js_escape(msg.get("text", "")) + window.evaluate_js(f"if(window.setMsg) setMsg('{esc}');") + elif cmd == "SHOW_NEXT": + window.evaluate_js("if(window.showNext) showNext();") + elif cmd == "HIDE_NEXT": + window.evaluate_js("if(window.hideNext) hideNext();") + elif cmd == "SHOW_CHOICE": + left = _js_escape(msg.get("left", "")) + right = _js_escape(msg.get("right", "")) + window.evaluate_js( + f"if(window.setChoice) setChoice('{left}', '{right}');" + ) + elif cmd == "SHOW_INPUT": + label = _js_escape(msg.get("label", "Save")) + window.evaluate_js( + f"if(window.setInput) setInput('{label}');" + ) + elif cmd == "CLEAR": + window.evaluate_js("if(window.clearAll) clearAll();") + elif cmd == "CLOSE": + _log("stdin_reader: CLOSE received, 
destroying window") + try: + window.destroy() + except Exception: + import traceback + _log( + "stdin_reader: window.destroy() raised:\n" + + traceback.format_exc() + ) + return + except Exception: + # Window may have been destroyed mid-flight — log and + # keep the reader alive so the process exits cleanly. + import traceback + _log( + f"stdin_reader: cmd={cmd!r} dispatch raised:\n" + + traceback.format_exc() + ) + except Exception: + import traceback + _log("stdin_reader: outer loop raised:\n" + traceback.format_exc()) + _log("stdin_reader: thread exiting (stdin EOF or pipe break)") + + +# ── subprocess entry point ──────────────────────────────────────────────── + + +def _run_subprocess_banner() -> None: + """Subprocess body. Imports webview lazily so the parent (which only + uses StatusBanner) doesn't pay its startup cost when it imports this + module. + + Mirrors `banner_test.py` byte-for-byte except for the JSON-stdio + protocol that lets the parent drive the wizard state machine.""" + _log(f"subprocess start (sys.executable={sys.executable!r})") + # Top-level guard: any exception escaping the GUI setup or webview.start() + # must land in the debug log — otherwise the user sees the pill flash and + # vanish with nothing to point at. Each step is also wrapped individually + # so we know exactly which one died. + try: + import webview + _log("webview imported") + + compact = "--compact" in sys.argv[1:] + + # Primary-screen width via Win32. GetSystemMetrics(SM_CXSCREEN=0) + # returns the DPI-virtualised value in this freshly spawned, + # still-DPI-unaware subprocess — identical to what tkinter's + # winfo_screenwidth() returned before, without dragging the + # tcl/tk runtime into the Nuitka binary (tkinter is listed in + # nofollow_third_party in windows_binary_build.py:533 and is + # therefore not bundled in the compiled exe). 
+ try: + screen_w = ctypes.windll.user32.GetSystemMetrics(0) or 1920 + except Exception: + screen_w = 1920 + + w = COMPACT_SIZE if compact else PILL_WIDTH + h = COMPACT_SIZE if compact else PILL_HEIGHT + x = max(0, screen_w - w - SCREEN_MARGIN) + y = SCREEN_MARGIN + html = COMPACT_HTML if compact else BANNER_HTML + title = f"AutoUseBanner_{uuid.uuid4().hex[:8]}" + state = _BannerState(title=title, width=w, min_h=h, compact=compact) + next_clicked, choice_clicked, save_clicked, height_changed = ( + _make_js_handlers(state) + ) + + # No js_api here — methods on a Nuitka-compiled class fail pywebview's + # `inspect.ismethod` filter and never get exposed to JS. We register + # the handlers via window.expose() below instead. + window = webview.create_window( + title, + html=html, + width=w, + height=h, + min_size=(w, h), + x=x, + y=y, + frameless=True, + on_top=True, + easy_drag=True, + resizable=False, + ) + state.window = window + window.expose(next_clicked, choice_clicked, save_clicked, height_changed) + _log("window created and handlers exposed") + + def _on_shown(): + _log("on_shown: entered") + # Compact mode: WinForms stretches our small create_window + # request to its OS-imposed minimum width (~132+ logical px), + # producing a wide pill instead of the tight circle we want. + # A programmatic window.resize() AFTER the form is alive + # bypasses that minimum — Form.Size setter doesn't go through + # the SM_CXMINTRACK clamp the way the initial size does. We + # then re-clip the (now smaller, square) window into a circle. + if compact: + try: + window.resize(COMPACT_SIZE, COMPACT_SIZE) + # Give WinForms one frame to actually realise the new + # rect before _apply_rounded_region reads it — without + # this the region clip runs against the old wide-pill + # geometry and we lose the circle shape. + time.sleep(0.1) + except Exception: + pass + # Clip into a pill (or circle, in compact mode) and emit READY + # so the parent's show() unblocks. 
+ _apply_rounded_region(title) + # Compact indicator is purely visual — drop mouse input so the + # user can click the desktop or any window underneath it. Only + # applied to compact mode; the standard wizard pill needs + # Next / Save / choice clicks to land. + if compact: + _make_click_through(title) + _log("on_shown: about to emit READY") + _emit("READY") + _log("on_shown: READY emitted") + # Spawn the stdin reader once the window is up. + threading.Thread( + target=_stdin_reader, args=(window,), daemon=True + ).start() + _log("on_shown: stdin reader thread spawned, exiting handler") + + window.events.shown += _on_shown + + # Lifecycle observability: log if the window starts closing or has been + # closed by anything other than our own CLOSE command. `events.closing` + # handlers must return a truthy value to allow the close; the tuple-idiom + # logs first and then yields True. + window.events.closing += lambda: (_log("event: closing"), True)[1] + window.events.closed += lambda: _log("event: closed") + + # Give the subprocess's WebView2 environment its own UserDataFolder. + # pywebview's default is %APPDATA%\pywebview ([winforms.py:704]) — shared + # process-wide. In the compiled exe the parent (main AutoUse window) and + # this subprocess are both AutoUse.exe and would otherwise contend on the + # same folder, which can cause WebView2 to tear down our renderer process + # seconds into operation. A per-PID temp folder is invisible to dev mode + # (each python interpreter already has its own folder) and isolates the + # banner subprocess cleanly in the binary build. + storage_path = os.path.join( + tempfile.gettempdir(), f"autouse_banner_{os.getpid()}" + ) + _log(f"webview.start(storage_path={storage_path!r})") + + # webview.start() runs the GUI loop in this subprocess's main thread. + # Blocks until window.destroy() — which the CLOSE command triggers. 
+ try: + webview.start(storage_path=storage_path) + _log("webview.start() returned normally") + except Exception: + import traceback + _log("webview.start() raised:\n" + traceback.format_exc()) + raise + + _emit("CLOSED") + _log("subprocess exit (CLOSED emitted)") + except Exception: + # Catches anything escaping the GUI setup so we have a footprint + # in the log instead of just "subprocess vanished". + import traceback + _log("_run_subprocess_banner crashed:\n" + traceback.format_exc()) + raise + + +# ── parent-side wrapper ────────────────────────────────────────────────── + + +class StatusBanner: + """Drop-in Windows mirror of the macOS Cocoa banner, backed by a + subprocess that runs the pywebview pill independently.""" + + # Module path the subprocess runs. After merging banner_proc.py + # into this file, the subprocess re-executes THIS module with the + # `if __name__ == "__main__"` guard firing into + # _run_subprocess_banner(). + _PROC_MODULE = "Auto_Use.windows_use.remote_connection.telegram.banner" + + def __init__(self, compact: bool = False): + self._compact = compact + self._proc: subprocess.Popen | None = None + self._stdout_thread: threading.Thread | None = None + self._closed = threading.Event() + self._ready = threading.Event() + self._next_event = threading.Event() + # Distinguishes a real NEXT click from a subprocess-close that also + # has to unblock _next_event so waiters don't deadlock. Only the + # "NEXT" stdout event flips this to True; close-cleanup leaves it + # False so callers can tell the user dismissed the banner. + self._next_clicked = False + self._choice_q: Queue = Queue() + self._input_q: Queue = Queue() + + # ── public API ─────────────────────────────────────────────────────── + + def show(self) -> None: + if self._proc is not None or self._closed.is_set(): + return + + # In the Nuitka build, sys.executable is AutoUse.exe — a compiled C + # binary with no `-m` module loader. 
Running it with `-m …banner` + # silently re-execs the whole AutoUse app (Flask + main webview + + # Telegram bot), giving the user a second main window instead of + # the pill. Re-exec AutoUse.exe with --banner-mode so app.py's + # main() can route directly to _run_subprocess_banner. In dev + # (`python app.py`) sys.executable IS a python interpreter, so + # the old -m invocation still works and is preferred — it avoids + # the cost of bootstrapping app.py just to reach the banner. + # cwd: pin the subprocess to the binary's install dir in the compiled + # build so WebView2's native DLL loader resolves WebView2Loader.dll, + # WebBrowserInterop.x64.dll, etc. from the install folder regardless + # of what cwd the parent inherited (a Start-menu launch leaves cwd + # at the user's home dir; a shortcut can leave it anywhere). In dev + # mode cwd=None inherits the parent's, which is the repo root — + # matches the working behaviour. + cwd = None + if _IS_COMPILED: + exe_dir = os.path.dirname(sys.executable) + main_exe = os.path.join(exe_dir, "AutoUse.exe") + args = [main_exe, "--banner-mode"] + cwd = exe_dir + else: + args = [sys.executable, "-m", self._PROC_MODULE] + if self._compact: + args.append("--compact") + + try: + self._proc = subprocess.Popen( + args, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + # stderr is left attached so the subprocess can write + # diagnostics to our terminal (useful for debugging, + # never gets parsed). + stderr=None, + text=True, + bufsize=1, # line-buffered + cwd=cwd, + # On Windows, hide the extra console window subprocess + # would otherwise spawn. 
+ creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0), + ) + _stderr( + f"spawned banner subprocess pid={self._proc.pid} " + f"compact={self._compact}" + ) + except Exception as e: + _stderr(f"banner subprocess spawn failed: {e!r}") + self._proc = None + return + + self._stdout_thread = threading.Thread( + target=self._stdout_reader, + daemon=True, + name="banner-stdout-reader", + ) + self._stdout_thread.start() + + # Block until the subprocess emits READY (banner is visible). + # 15 s ceiling covers a cold Python interpreter start; under + # normal conditions READY arrives in well under a second. + if not self._ready.wait(timeout=15): + _stderr("banner subprocess never emitted READY") + + def update(self, text: str) -> None: + if self._compact: + return + self._send({"cmd": "MSG", "text": text or ""}) + + def wait_for_next(self, timeout: float | None = None) -> bool: + if self._compact: + return True + if self._proc is None: + return True + # Banner already dismissed (subprocess gone) — don't pretend the + # user clicked Next. Callers use the False return to short-circuit + # the wizard instead of opening Edge / advancing steps. 
+ if self._closed.is_set(): + return False + self._next_clicked = False + self._next_event.clear() + self._send({"cmd": "SHOW_NEXT"}) + self._next_event.wait(timeout=timeout) + self._send({"cmd": "HIDE_NEXT"}) + return self._next_clicked + + def wait_for_choice( + self, left_label: str, right_label: str, timeout=None + ): + if self._compact or self._proc is None: + return None + self._drain(self._choice_q) + self._send({ + "cmd": "SHOW_CHOICE", + "left": left_label, + "right": right_label, + }) + try: + value = self._choice_q.get(timeout=timeout if timeout else 600) + except Empty: + value = None + self._send({"cmd": "CLEAR"}) + return value + + def wait_for_input(self, save_label: str = "Save"): + if self._compact or self._proc is None: + return None + self._drain(self._input_q) + self._send({"cmd": "SHOW_INPUT", "label": save_label}) + try: + value = self._input_q.get(timeout=600) + except Empty: + value = None + self._send({"cmd": "CLEAR"}) + return value + + def close(self) -> None: + if self._closed.is_set(): + return + self._closed.set() + # Unblock anything still parked on a Queue/Event before we tear + # the subprocess down. + self._next_event.set() + try: + self._choice_q.put_nowait(None) + except Exception: + pass + try: + self._input_q.put_nowait(None) + except Exception: + pass + + if self._proc is None: + return + + # Ask the subprocess to close gracefully; fall back to terminate. + self._send({"cmd": "CLOSE"}) + try: + self._proc.wait(timeout=3) + except Exception: + try: + self._proc.terminate() + self._proc.wait(timeout=2) + except Exception: + try: + self._proc.kill() + except Exception: + pass + self._proc = None + + # ── internals ──────────────────────────────────────────────────────── + + def _send(self, msg: dict) -> None: + """Write a JSON command to the subprocess stdin. 
Silent on + broken-pipe errors so a dead subprocess doesn't crash callers.""" + if self._proc is None or self._proc.stdin is None: + return + try: + self._proc.stdin.write(json.dumps(msg) + "\n") + self._proc.stdin.flush() + except Exception: + pass + + def _stdout_reader(self) -> None: + """Read JSON events from the subprocess and route to local + Event / Queue primitives so wait_for_* unblock at the right time.""" + if self._proc is None or self._proc.stdout is None: + return + for line in self._proc.stdout: + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + except Exception: + continue + event = msg.get("event") + if event == "READY": + _stderr("banner subprocess READY — pill visible") + self._ready.set() + elif event == "NEXT": + # Flag must be set BEFORE the Event so a waiter that wakes + # on _next_event.wait() reads the True value, not the + # default False left by the close-cleanup path below. + self._next_clicked = True + self._next_event.set() + elif event == "CHOICE": + self._choice_q.put(msg.get("value", "left")) + elif event == "SAVE": + self._input_q.put(msg.get("value", "")) + elif event == "CLOSED": + _stderr("banner subprocess CLOSED") + break + + # Subprocess exited (whether via CLOSED or pipe break). Unblock + # any pending waiters so callers don't deadlock. 
+ self._closed.set() + self._ready.set() + self._next_event.set() + try: + self._choice_q.put_nowait(None) + except Exception: + pass + try: + self._input_q.put_nowait(None) + except Exception: + pass + + @staticmethod + def _drain(q: Queue) -> None: + try: + while True: + q.get_nowait() + except Empty: + pass + + +# ── module entry: run as subprocess if invoked via `python -m …banner` ── + +if __name__ == "__main__": + _run_subprocess_banner() diff --git a/Auto_Use/windows_use/remote_connection/telegram/pair.html b/Auto_Use/windows_use/remote_connection/telegram/pair.html deleted file mode 100644 index cc7b159..0000000 --- a/Auto_Use/windows_use/remote_connection/telegram/pair.html +++ /dev/null @@ -1,220 +0,0 @@ - - - - - -Auto Use — Pair - - - - -
- -
Connect Telegram for remote control
- -
-
1
-
Open @BotFather and send /newbot
-
-
-
2
-
Name it Auto Use and pick any username
-
-
-
3
-
Paste the token below
-
- - - -
-
- -
-
-
Connected!
-
Send tasks from Telegram to control your PC
- Open Bot -
- - - - \ No newline at end of file diff --git a/Auto_Use/windows_use/remote_connection/telegram/service.py b/Auto_Use/windows_use/remote_connection/telegram/service.py index 5d0f650..55e2a5c 100644 --- a/Auto_Use/windows_use/remote_connection/telegram/service.py +++ b/Auto_Use/windows_use/remote_connection/telegram/service.py @@ -17,11 +17,34 @@ # A small attribution goes a long way toward a healthy open-source # community — thank you for contributing. +"""Telegram → AgentService bridge with a guided provider/model picker. + +Runs as a standalone process (not mounted into Flask). On the first message +the bot asks you to pick a provider (limited to providers with a non-empty +key in api_key.txt), then a model (from the same MODEL_MAPPINGS the +AutoUse frontend uses). Subsequent messages are dispatched as tasks to the +agent with that provider/model. Picked provider/model persist for the whole +chat session until you `/reset`. + +Token lookup: TELEGRAM_BOT_TOKEN is read from exactly one place: + Auto_Use/api_key/api_key.txt +Environment variables and .env are deliberately NOT consulted, so the bot +has a single file of record. + +Setup: + 1. @BotFather → /newbot → copy token. + 2. Paste it into api_key.txt as TELEGRAM_BOT_TOKEN=… + 3. Make sure at least one provider key (e.g. OPENROUTER_API_KEY=…) is set in api_key.txt. + 4. python -m Auto_Use.windows_use.remote_connection.telegram.service + 5. On phone: open Telegram, find your bot, send any message. +""" import asyncio -import threading +import datetime +import importlib import logging +import sys +import threading from pathlib import Path -from typing import Optional from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup from telegram.ext import ( @@ -30,298 +53,798 @@ MessageHandler, CallbackQueryHandler, filters, - ContextTypes, ) logger = logging.getLogger(__name__) -# AgentService is imported lazily inside _run_agent so this module (and the Telegram -# polling thread) can start without loading tree/element → skimage until a task runs. 
-# service.py -> telegram -> remote_connection -> windows_use -> Auto_Use / api_key / api_key.txt -API_KEY_FILE = Path(__file__).parent.parent.parent.parent / "api_key" / "api_key.txt" -SCRATCHPAD_PATH = Path(__file__).parent.parent / "scratchpad" / "milestone" / "milestone.md" +# The Telegram surface treats api_key.txt as its single source of truth — we +# deliberately do NOT consult .env or env vars here. .env is app.py's general +# env-loading concern; keeping the bot self-contained against api_key.txt +# avoids two-files-of-record confusion. +# +# Resolve api_key.txt the same way app.py's get_auto_use_path() does: in a +# compiled/frozen build __file__ points INSIDE the bundle, so the parents[4] +# walk would miss the editable api_key.txt that lives next to the executable +# (the one the Settings panel and the regular agent use). Fall back to the +# source-tree path in dev (python app.py). +_IS_COMPILED = getattr(sys, "frozen", False) or "__compiled__" in globals() +if _IS_COMPILED: + _API_KEY_FILE = Path(sys.executable).parent / "Auto_Use" / "api_key" / "api_key.txt" +else: + # service.py → telegram → remote_connection → windows_use → Auto_Use → repo root + _API_KEY_FILE = ( + Path(__file__).resolve().parents[4] / "Auto_Use" / "api_key" / "api_key.txt" + ) + +# Agent writes per-step "milestone" lines here. We tail this file during a +# task and forward each new line back to the user's Telegram chat so they +# see the agent's progress in real time. +SCRATCHPAD_PATH = ( + Path(__file__).resolve().parents[2] / "scratchpad" / "milestone" / "milestone.md" +) +SCRATCHPAD_POLL_SEC = 2.0 +MAX_TG_MSG_LEN = 4000 # Telegram caps at 4096; leave headroom for safety +# Provider id → API-key name in the KV files. Same mapping the macOS side +# uses ([macOS_use/remote_connection/telegram/service.py:78-85]). 
PROVIDER_KEY_MAP = { - 'openrouter': 'OPENROUTER_API_KEY', - 'groq': 'GROQ_API_KEY', - 'openai': 'OPENAI_API_KEY', - 'anthropic': 'ANTHROPIC_API_KEY', - 'google': 'GOOGLE_API_KEY', - 'perplexity': 'PERPLEXITY_API_KEY', + "openrouter": "OPENROUTER_API_KEY", + "groq": "GROQ_API_KEY", + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "google": "GOOGLE_API_KEY", + "perplexity": "PERPLEXITY_API_KEY", } -def _read_api_keys() -> dict: - """Read api_key.txt and return dict of key_name -> value.""" - keys = {} - if API_KEY_FILE.exists(): +# ── file helpers ───────────────────────────────────────────────────────────── + +def _read_all_keys(path: Path) -> dict: + """Parse a simple KEY=VALUE file (one per line) into a dict. Skips empty + values and lines starting with '#'.""" + out = {} + if not path.exists(): + return out + try: + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, _, v = line.partition("=") + k, v = k.strip(), v.strip() + if v: + out[k] = v + except Exception: + pass + return out + + +def _resolve_token() -> str | None: + """Read TELEGRAM_BOT_TOKEN from api_key.txt only. .env and env vars are + intentionally ignored — see header comment.""" + return _read_all_keys(_API_KEY_FILE).get("TELEGRAM_BOT_TOKEN") + + +def _get_available_providers() -> list: + """Providers with a non-empty key in api_key.txt only.""" + keys = _read_all_keys(_API_KEY_FILE) + return [ + {"id": pid, "key": keys[kname]} + for pid, kname in PROVIDER_KEY_MAP.items() + if keys.get(kname) + ] + + +def _set_key_in_file(path: Path, key: str, value: str) -> None: + """Write/update KEY=value in a KV file, preserving every other line. + + Unlike a naive read-all-and-write-back-with-_read_all_keys, this keeps + empty-value placeholder lines (e.g. GROQ_API_KEY=) intact — the AutoUse + UI relies on those for its provider list rendering. 
+ """ + lines = [] + found = False + if path.exists(): try: - with open(API_KEY_FILE, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if '=' in line: - name, _, value = line.partition('=') - keys[name] = value + with open(path, "r", encoding="utf-8") as f: + for raw in f: + stripped = raw.strip() + if stripped.startswith(f"{key}="): + lines.append(f"{key}={value}\n") + found = True + else: + lines.append(raw if raw.endswith("\n") else raw + "\n") except Exception: - pass - return keys - - -def _get_available_providers() -> list[dict]: - """Return providers that have a non-empty API key in api_key.txt.""" - keys = _read_api_keys() - available = [] - for provider_id, key_name in PROVIDER_KEY_MAP.items(): - if keys.get(key_name, '').strip(): - available.append({'id': provider_id, 'key': keys[key_name]}) - return available - - -def _get_models_for_provider(provider_id: str) -> list[dict]: - """Import the view module for a provider and return its non-hidden models.""" - view_modules = { - 'openrouter': 'Auto_Use.windows_use.llm_provider.openrouter.view', - 'groq': 'Auto_Use.windows_use.llm_provider.groq.view', - 'openai': 'Auto_Use.windows_use.llm_provider.openai.view', - 'anthropic': 'Auto_Use.windows_use.llm_provider.anthropic.view', - 'google': 'Auto_Use.windows_use.llm_provider.google.view', - 'perplexity': 'Auto_Use.windows_use.llm_provider.perplexity.view', - } - module_path = view_modules.get(provider_id) - if not module_path: - return [] + logger.warning("failed to read %s while updating %s", path, key) + return + if not found: + lines.append(f"{key}={value}\n") + try: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.writelines(lines) + except Exception: + logger.warning("failed to write %s", path) + + +def _resolve_owner_chat_id() -> int | None: + """Owner chat_id = whoever last sent /start. 
Stored in api_key.txt as + TELEGRAM_OWNER_CHAT_ID=…, so it survives restarts.""" + val = _read_all_keys(_API_KEY_FILE).get("TELEGRAM_OWNER_CHAT_ID") + if not val: + return None try: - import importlib - mod = importlib.import_module(module_path) - mappings = getattr(mod, 'MODEL_MAPPINGS', {}) + return int(val) + except ValueError: + return None + + +def _save_owner_chat_id(chat_id: int) -> None: + """Persist the owner chat_id so we can message them on the next boot.""" + _set_key_in_file(_API_KEY_FILE, "TELEGRAM_OWNER_CHAT_ID", str(chat_id)) + + +def _get_models_for_provider(provider_id: str) -> list: + """Read MODEL_MAPPINGS from Auto_Use/windows_use/llm_provider//view.py + and return non-hidden entries as [{id, display_name}, …].""" + try: + mod = importlib.import_module( + f"Auto_Use.windows_use.llm_provider.{provider_id}.view" + ) + mappings = getattr(mod, "MODEL_MAPPINGS", {}) return [ - {'id': model_id, 'display_name': info.get('display_name', model_id)} - for model_id, info in mappings.items() - if not info.get('hidden', False) + {"id": mid, "display_name": info.get("display_name", mid)} + for mid, info in mappings.items() + if not info.get("hidden", False) ] except Exception: return [] -class TelegramAgentBot: - """Telegram bot that lets users pick a provider/model and run agent tasks.""" - - def __init__(self, token: str): - self._token = token - self._busy = False - self._stop_event: Optional[threading.Event] = None - self._pending: dict = {} # chat_id -> {task, provider, api_key} - - # ── helpers ─────────────────────────────────────────────────────────── - - def _monitor_scratchpad(self, chat_id: int, loop, bot, stop_event: threading.Event): - """Poll the scratchpad file every 5s and send new lines to the Telegram chat.""" - last_pos = 0 - while not stop_event.is_set(): - if SCRATCHPAD_PATH.exists(): - try: - with open(SCRATCHPAD_PATH, 'r', encoding='utf-8') as f: - f.seek(last_pos) - new_content = f.read() - if new_content: - last_pos = f.tell() - 
lines = new_content.strip().split('\n') - for line in lines: - if line.strip(): - text = line.strip() - for chunk in [text[i:i+4096] for i in range(0, len(text), 4096)]: - asyncio.run_coroutine_threadsafe( - bot.send_message(chat_id=chat_id, text=chunk), loop - ) - except Exception as exc: - logger.warning("Scratchpad read error: %s", exc) - stop_event.wait(5) - - # Final sweep - if SCRATCHPAD_PATH.exists(): +# ── per-chat state ─────────────────────────────────────────────────────────── + +# chat_id → { +# "phase": "idle" | "pick_provider" | "pick_model" | "ready" | "running", +# "provider": str | None, +# "model": str | None, +# "model_display": str | None, +# "queue": list[str], # tasks waiting to run, FIFO +# "pending": dict[str, str], # pending_id → task awaiting Yes/No +# "pending_counter": int, # monotonic id source for pending +# } +_chat_state: dict = {} + +# Guards mutations that read+modify state across threads (queue drain races +# between _run_agent's finally and the callback handler tapping "Yes"). +_state_lock = threading.Lock() + + +def _state(chat_id: int) -> dict: + return _chat_state.setdefault(chat_id, {"phase": "idle"}) + + +def _maybe_run_next_queued(chat_id: int, bot, loop) -> None: + """If this chat is ready and has a queued task, pop the next one and + start it. 
Threadsafe — called from both _run_agent's finally (worker + thread) and the q+ callback (asyncio loop).""" + with _state_lock: + state = _chat_state.get(chat_id) + if not state: + return + if state.get("phase") != "ready": + return + queue = state.get("queue") or [] + if not queue: + return + provider = state.get("provider") + model = state.get("model") + if not provider or not model: + return + next_task = queue.pop(0) + display = state.get("model_display") or model + state["phase"] = "running" + + _send_chat( + bot, + chat_id, + f"📝 Running queued task: {next_task[:200]} ({provider} · {display})", + loop, + ) + threading.Thread( + target=_run_agent, + args=(next_task, provider, model, chat_id, bot, loop), + daemon=True, + name=f"telegram-agent-{chat_id}-queued", + ).start() + + +# ── Telegram handlers ──────────────────────────────────────────────────────── + +def _build_online_text(providers: list) -> str: + now_str = datetime.datetime.now().strftime("%H:%M:%S") + if providers: + provider_line = ", ".join(p["id"] for p in providers) + return f"🟢 AutoUse online at {now_str}\nProviders: {provider_line}" + return f"🟢 AutoUse online at {now_str}\nProviders: (none configured)" + + +async def _show_provider_picker(message): + providers = _get_available_providers() + # Always lead with the "AutoUse online" status line so the user gets the + # same greeting they'd see at app boot, even when they message the bot + # first instead of waiting for the unsolicited startup announcement. + await message.reply_text(_build_online_text(providers)) + if not providers: + await message.reply_text( + "⚠️ No provider API keys found. Add at least one (e.g. " + "OPENROUTER_API_KEY=…) to api_key.txt or .env and try again." 
+ ) + return False + buttons = [ + [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")] + for p in providers + ] + await message.reply_text( + "👋 Pick a provider:", reply_markup=InlineKeyboardMarkup(buttons) + ) + return True + + +async def _discover_owner_from_updates(bot) -> int | None: + """Peek at the latest pending update on Telegram's servers and use its + chat_id as the owner. Lets the bot self-bootstrap on the very first run + after the chat-saving code was deployed, without requiring the user to + /start again. Safe to call before start_polling — uses offset=-1 which + Telegram supports as 'just the most recent update', and doesn't consume + updates from the polling updater's offset cursor.""" + try: + updates = await bot.get_updates(offset=-1, limit=1, timeout=2) + except Exception: + logger.warning("owner discovery: get_updates failed", exc_info=True) + return None + for upd in updates: + chat = getattr(upd, "effective_chat", None) + if chat and chat.id: + return int(chat.id) + return None + + +async def _post_init(application) -> None: + """Fires once after the bot finishes initialising (before polling starts). + Used to message the saved owner: 'AutoUse online at …' + a fresh provider + picker — so the user doesn't have to send anything to get going.""" + owner_id = _resolve_owner_chat_id() + if not owner_id: + # Not saved yet — try to auto-discover from Telegram's pending updates. + # Works if the user has ever messaged the bot, even before the + # chat-saving code was deployed. Persist the result so we don't need + # to re-discover on every boot. 
+ owner_id = await _discover_owner_from_updates(application.bot) + if owner_id: try: - with open(SCRATCHPAD_PATH, 'r', encoding='utf-8') as f: - f.seek(last_pos) - new_content = f.read() - if new_content: - for line in new_content.strip().split('\n'): - if line.strip(): - asyncio.run_coroutine_threadsafe( - bot.send_message(chat_id=chat_id, text=line.strip()), loop - ) + _save_owner_chat_id(owner_id) + logger.info( + "owner discovery: saved chat_id=%s from getUpdates", + owner_id, + ) except Exception: - pass + logger.warning("owner discovery: could not persist chat_id", exc_info=True) + if not owner_id: + # No owner anywhere — they've never interacted with the bot. Stay + # silent; they'll register themselves with /start. + return + bot = application.bot + providers = _get_available_providers() + try: + await bot.send_message(chat_id=owner_id, text=_build_online_text(providers)) + except Exception: + logger.exception("startup announcement: failed to send hello") + return # if we can't even greet, don't bother with the picker - def _run_agent(self, task: str, provider: str, model: str, api_key: str, - chat_id: int, loop, bot): + if not providers: try: - from ...agent.service import AgentService - - agent = AgentService( - provider=provider, - model=model, - save_conversation=True, - thinking=True, - api_key=api_key, - stop_event=self._stop_event, + await bot.send_message( + chat_id=owner_id, + text="⚠️ No provider API keys found. 
Add at least one to api_key.txt and /reset.", ) + except Exception: + pass + return - monitor = threading.Thread( - target=self._monitor_scratchpad, - args=(chat_id, loop, bot, self._stop_event), - daemon=True, - ) - monitor.start() - - agent.process_request(task) - - asyncio.run_coroutine_threadsafe( - bot.send_message(chat_id=chat_id, text="✅ Task completed."), loop - ) - except Exception as exc: - logger.exception("Agent error") - asyncio.run_coroutine_threadsafe( - bot.send_message(chat_id=chat_id, text=f"❌ Agent error: {exc}"), loop - ) - finally: - self._busy = False - self._stop_event = None - self._pending.pop(chat_id, None) + buttons = [ + [InlineKeyboardButton(p["id"], callback_data=f"provider:{p['id']}")] + for p in providers + ] + try: + await bot.send_message( + chat_id=owner_id, + text="👋 Pick a provider:", + reply_markup=InlineKeyboardMarkup(buttons), + ) + # Park the owner's chat in pick_provider so the next button tap routes + # cleanly through the existing callback flow. + _chat_state[owner_id] = {"phase": "pick_provider"} + except Exception: + logger.exception("startup announcement: failed to send provider picker") - # ── Telegram handlers ──────────────────────────────────────────────── - async def start_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE): +async def start_cmd(update, ctx): + chat_id = update.effective_chat.id + # Remember this chat so future boots can auto-greet (startup announcement). + # Best-effort — never let a file-write failure block /start. + try: + _save_owner_chat_id(chat_id) + except Exception: + logger.warning("could not persist owner chat_id", exc_info=True) + _chat_state[chat_id] = {"phase": "pick_provider"} + ok = await _show_provider_picker(update.message) + if not ok: + _chat_state[chat_id] = {"phase": "idle"} + + +async def reset_cmd(update, ctx): + # Wipe state for this chat — including any queued tasks and pending + # awaiting Yes/No prompts. 
We do NOT clear the persisted owner chat_id; + # /reset is "start over the conversation", not "forget I exist". + _chat_state[update.effective_chat.id] = {"phase": "idle"} + await update.message.reply_text( + "🔄 Reset. Send any message to pick a provider again." + ) + + +async def text_handler(update, ctx): + chat_id = update.effective_chat.id + # Persist on every message, not just /start, so the next app boot can + # auto-announce "AutoUse online" without the user having to /start first. + try: + _save_owner_chat_id(chat_id) + except Exception: + logger.warning("could not persist owner chat_id", exc_info=True) + state = _state(chat_id) + phase = state.get("phase", "idle") + + if phase in ("idle", "pick_provider"): + state["phase"] = "pick_provider" + ok = await _show_provider_picker(update.message) + if not ok: + state["phase"] = "idle" + return + + if phase == "pick_model": await update.message.reply_text( - "👋 Send me a task and I will execute it on the desktop.\n\n" - "Commands:\n" - "/stop – abort current task\n" - "/status – check if a task is running" + "Pick a model from the buttons above first." ) + return - async def stop_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE): - if self._stop_event and self._busy: - self._stop_event.set() - await update.message.reply_text("🛑 Stop signal sent.") - else: - await update.message.reply_text("No task is running.") - - async def status_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE): - if self._busy: - await update.message.reply_text("⏳ A task is currently running. Send /stop to cancel.") - else: - await update.message.reply_text("💤 Idle – send a message to start a task.") - - async def task_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE): - """User sends a text message → store it as a pending task, show provider buttons.""" - if self._busy: - await update.message.reply_text( - "⏳ A task is already running. Send /stop first, then try again." 
- ) - return - - task = update.message.text.strip() + if phase == "running": + # Busy — offer to queue this task. Each pending prompt gets a unique + # id so multiple "queue this?" prompts can coexist if the user spams. + task = (update.message.text or "").strip() if not task: return - - providers = _get_available_providers() - if not providers: - await update.message.reply_text( - "⚠️ No API keys configured.\n" - "Add provider API keys through the Auto Use desktop app settings first." + state.setdefault("pending", {}) + state["pending_counter"] = state.get("pending_counter", 0) + 1 + pending_id = str(state["pending_counter"]) + state["pending"][pending_id] = task + buttons = [[ + InlineKeyboardButton("✅ Yes, queue it", callback_data=f"q+:{pending_id}"), + InlineKeyboardButton("❌ No", callback_data=f"q-:{pending_id}"), + ]] + await update.message.reply_text( + f"⏳ Currently busy performing a task.\n" + f"Do you want to queue: \"{task[:200]}\" ?", + reply_markup=InlineKeyboardMarkup(buttons), + ) + return + + # phase == "ready" + task = (update.message.text or "").strip() + if not task: + return + state["phase"] = "running" + provider = state["provider"] + model = state["model"] + display = state.get("model_display", model) + await update.message.reply_text( + f"📝 Running: {task} ({provider} · {display})" + ) + bot = ctx.bot + loop = asyncio.get_running_loop() + threading.Thread( + target=_run_agent, + args=(task, provider, model, chat_id, bot, loop), + daemon=True, + ).start() + + +async def callback_handler(update, ctx): + query = update.callback_query + await query.answer() + chat_id = query.message.chat_id + try: + _save_owner_chat_id(chat_id) + except Exception: + logger.warning("could not persist owner chat_id", exc_info=True) + state = _state(chat_id) + data = query.data or "" + + if data.startswith("provider:"): + provider_id = data.split(":", 1)[1] + state["provider"] = provider_id + state["phase"] = "pick_model" + models = 
_get_models_for_provider(provider_id) + if not models: + state["phase"] = "pick_provider" + await query.edit_message_text( + f"⚠️ No models found for {provider_id}. Pick another provider." ) return - - chat_id = update.effective_chat.id - self._pending[chat_id] = {'task': task} - buttons = [ - [InlineKeyboardButton(p['id'], callback_data=f"provider:{p['id']}")] - for p in providers + [InlineKeyboardButton(m["display_name"], callback_data=f"model:{m['id']}")] + for m in models ] - await update.message.reply_text( - f"📝 Task received:\n{task}\n\nChoose a provider:", + await query.edit_message_text( + f"Pick a model for {provider_id}:", reply_markup=InlineKeyboardMarkup(buttons), ) + return + + if data.startswith("model:"): + model_id = data.split(":", 1)[1] + provider_id = state.get("provider") + if not provider_id: + state["phase"] = "idle" + await query.edit_message_text("Session expired. Send any message to start over.") + return + models = _get_models_for_provider(provider_id) + display = next( + (m["display_name"] for m in models if m["id"] == model_id), model_id + ) + state["model"] = model_id + state["model_display"] = display + state["phase"] = "ready" + await query.edit_message_text( + f"✅ Provider: {provider_id} / Model: {display}\n" + f"Send me a task whenever you're ready." + ) + return - async def callback_handler(self, update: Update, context: ContextTypes.DEFAULT_TYPE): - """Handle inline-keyboard button presses for provider/model selection.""" - query = update.callback_query - await query.answer() - chat_id = query.message.chat_id - data = query.data - - pending = self._pending.get(chat_id) - if not pending: - await query.edit_message_text("Session expired. Send a new task.") + if data.startswith("q+:"): + # User wants to queue the pending task. 
+ pending_id = data.split(":", 1)[1] + task = (state.get("pending") or {}).pop(pending_id, None) + if not task: + await query.edit_message_text("(That prompt has already been handled.)") return + state.setdefault("queue", []).append(task) + qlen = len(state["queue"]) + await query.edit_message_text( + f"📥 Queued (position {qlen}): \"{task[:200]}\"\n" + f"Will run when the current task finishes." + ) + # Edge case: agent finished in the milliseconds between the prompt + # being sent and the user tapping Yes. Drain the queue now so the + # queued task isn't stranded. + _maybe_run_next_queued(chat_id, ctx.bot, asyncio.get_running_loop()) + return + + if data.startswith("q-:"): + # User declines to queue. Drop the pending task. + pending_id = data.split(":", 1)[1] + (state.get("pending") or {}).pop(pending_id, None) + await query.edit_message_text( + "👍 OK, won't queue it. I'll let you know once the current task is done." + ) + return - if data.startswith("provider:"): - provider_id = data.split(":", 1)[1] - providers = _get_available_providers() - api_key = next((p['key'] for p in providers if p['id'] == provider_id), None) - if not api_key: - await query.edit_message_text("⚠️ API key for this provider is no longer available.") - self._pending.pop(chat_id, None) - return - - pending['provider'] = provider_id - pending['api_key'] = api_key - - models = _get_models_for_provider(provider_id) - if not models: - await query.edit_message_text(f"⚠️ No models found for {provider_id}.") - self._pending.pop(chat_id, None) - return - - buttons = [ - [InlineKeyboardButton(m['display_name'], callback_data=f"model:{m['id']}")] - for m in models - ] - await query.edit_message_text( - f"Provider: {provider_id}\n\nChoose a model:", - reply_markup=InlineKeyboardMarkup(buttons), - ) - elif data.startswith("model:"): - model_id = data.split(":", 1)[1] - provider = pending.get('provider') - api_key = pending.get('api_key') - task = pending.get('task') +# ── scratchpad streaming 
───────────────────────────────────────────────────── + +def _send_chat(bot, chat_id, text, loop, wait: bool = False, timeout: float = 5.0): + """Schedule a bot.send_message on the asyncio loop from a worker thread. + Silently ignores failures so a transient send error never kills the + monitor thread. + + When wait=True, block the calling thread until the send actually + completes (or `timeout` seconds elapse). Used for terminal messages + like "✅ Done." that must land in the chat BEFORE the next message + is scheduled — without it, the "Done" send and the "Running queued + task" send race inside the asyncio loop as two parallel HTTP POSTs + and Telegram can deliver them out of order.""" + try: + fut = asyncio.run_coroutine_threadsafe( + bot.send_message(chat_id=chat_id, text=text), loop + ) + if wait: + try: + fut.result(timeout=timeout) + except Exception: + logger.warning( + "send_message to chat %s did not confirm within %ss", + chat_id, timeout, exc_info=True, + ) + except Exception: + logger.warning("Failed to schedule send_message to chat %s", chat_id) + + +def _monitor_scratchpad(chat_id, bot, loop, stop_event, start_pos): + """Tail SCRATCHPAD_PATH and forward each new non-empty line to the chat. + + Polls every SCRATCHPAD_POLL_SEC seconds. start_pos is the byte offset + the file was at when the task began — we only forward content written + AFTER that, so old milestones from previous tasks aren't replayed. + Exits when stop_event is set, after one final sweep to flush any tail. + """ + last_pos = start_pos + + def _read_and_forward(): + nonlocal last_pos + if not SCRATCHPAD_PATH.exists(): + # File was deleted (e.g. AgentService.__init__ wiping the + # scratchpad). Reset so the next poll re-reads the whole new + # file from the top instead of seeking past its end. 
def _run_agent(task, provider, model, chat_id, bot, loop):
    """Run one Telegram-submitted task through the agent (worker thread).

    While the agent works, a monitor thread forwards scratchpad milestones to
    the chat. A compact status banner is shown and the main AutoUse window is
    minimised; both are best-effort and must never prevent the task from
    running. When the task ends the chat receives "✅ Done." or an error
    message, the chat's phase is moved from 'running' back to 'ready', and
    one queued task (if any) is started.

    Args:
        task: Task text passed to ``AgentService.process_request``.
        provider: Provider name; mapped through ``PROVIDER_KEY_MAP`` to the
            key-file entry that holds its API key.
        model: Model identifier passed through to ``AgentService``.
        chat_id: Chat that receives milestone and completion messages.
        bot: Bot object forwarded to ``_send_chat`` and the monitor thread.
        loop: Loop object forwarded to ``_send_chat`` and the monitor thread.
    """
    # Compact "Telegram task in progress" indicator. The import and the
    # constructor are inside the guard too: if either fails the task must
    # still run, and the finally-block below must still reset the phase.
    task_banner = None
    try:
        from Auto_Use.windows_use.remote_connection.telegram.banner import StatusBanner
        task_banner = StatusBanner(compact=True)
        task_banner.show()
    except Exception:
        logger.warning("could not show task banner", exc_info=True)

    # Minimise the AutoUse pywebview window so the agent has the screen to
    # itself. We talk to pywebview directly via its global `windows` list
    # rather than importing from app.py — `python app.py` makes app.py the
    # __main__ module, so `from app import …` would re-import a *second*
    # copy of app.py whose webview_window is still None, and the call would
    # silently no-op.
    try:
        import webview as _webview
        if _webview.windows:
            _webview.windows[0].minimize()
    except Exception:
        logger.warning("could not minimise AutoUse window", exc_info=True)

    # Reset the milestone scratchpad before starting the monitor so the
    # monitor reads from byte 0 of whatever the agent writes next. A stale
    # offset into a file the agent later deletes and rewrites would stream a
    # mid-line fragment to the chat. Best-effort: a failure here is logged
    # and the task continues.
    try:
        if SCRATCHPAD_PATH.exists():
            SCRATCHPAD_PATH.unlink()
    except Exception:
        logger.warning("could not reset milestone scratchpad", exc_info=True)

    start_pos = 0
    stop_event = threading.Event()
    monitor = threading.Thread(
        target=_monitor_scratchpad,
        args=(chat_id, bot, loop, stop_event, start_pos),
        daemon=True,
        name=f"telegram-scratchpad-{chat_id}",
    )
    monitor.start()

    try:
        # Imported lazily — pulls in tree/element → skimage etc., which we
        # don't want to load until a task actually runs.
        from Auto_Use.windows_use.agent.service import AgentService

        # Pass the provider's key from the key file explicitly so the agent
        # does not depend on an environment variable the user never set.
        provider_key_name = PROVIDER_KEY_MAP.get(provider)
        provider_keys = _read_all_keys(_API_KEY_FILE)
        provider_api_key = (
            provider_keys.get(provider_key_name) if provider_key_name else None
        )

        agent = AgentService(
            provider=provider,
            model=model,
            save_conversation=False,
            thinking=True,
            api_key=provider_api_key,
        )
        agent.process_request(task)

        # Stop the monitor BEFORE the done message so the final scratchpad
        # sweep happens first — keeps the chat in chronological order.
        stop_event.set()
        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
        # wait=True: block until the message is sent before the
        # finally-block starts the next queued task, so "Done" cannot be
        # overtaken by the next task's announcement.
        _send_chat(bot, chat_id, "✅ Done.", loop, wait=True)
    except Exception as e:
        logger.exception("agent error")
        stop_event.set()
        monitor.join(timeout=SCRATCHPAD_POLL_SEC + 2)
        _send_chat(bot, chat_id, f"❌ Error: {e}", loop, wait=True)
    finally:
        if not stop_event.is_set():
            stop_event.set()
        if task_banner is not None:
            try:
                task_banner.close()
            except Exception:
                # Closing the banner is cosmetic; record it and carry on.
                logger.debug("could not close task banner", exc_info=True)
        with _state_lock:
            state = _chat_state.get(chat_id)
            if state is not None and state.get("phase") == "running":
                state["phase"] = "ready"
        # Drain one queued task if any — keeps phase='running' if it spawns.
        _maybe_run_next_queued(chat_id, bot, loop)
def start_bot() -> None:
    """Launch the Telegram polling bot on a background daemon thread.

    Calling this while a previous bot thread is still alive is a no-op.
    Every outcome (already running, token missing, starting, crashed) is
    reported through ``_stderr``.
    """
    global _BOT_THREAD

    already_running = _BOT_THREAD is not None and _BOT_THREAD.is_alive()
    if already_running:
        _stderr("start_bot() called but the bot is already running — skipping.")
        return

    token = _resolve_token()
    if not token:
        _stderr(
            "BOT NOT STARTED — TELEGRAM_BOT_TOKEN not found in env, .env, or "
            "api_key.txt. Paste your @BotFather token into one of those files."
        )
        return

    _stderr(f"starting bot (token ends in …{token[-6:]})")

    def _runner():
        import sys
        import traceback

        try:
            # The worker thread gets its own event loop, installed as the
            # thread's current loop before the bot coroutine runs.
            bot_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(bot_loop)
            telegram_app = _build_telegram_app(token)
            try:
                bot_loop.run_until_complete(_run_bot_until_stopped(telegram_app))
            finally:
                bot_loop.close()
        except Exception as crash:
            _stderr(f"BOT CRASHED: {crash!r}")
            traceback.print_exc(file=sys.stderr)

    worker = threading.Thread(target=_runner, daemon=True, name="telegram-bot")
    _BOT_THREAD = worker
    worker.start()
Called from a thread by app.py.""" - app = Application.builder().token(self._token).build() - app.add_handler(CommandHandler("start", self.start_handler)) - app.add_handler(CommandHandler("stop", self.stop_handler)) - app.add_handler(CommandHandler("status", self.status_handler)) - app.add_handler(CallbackQueryHandler(self.callback_handler)) - app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, self.task_handler)) - - logger.info("Telegram bot polling started") - app.run_polling(allowed_updates=Update.ALL_TYPES) - - def stop(self): - """Signal any running agent to stop.""" - if self._stop_event: - self._stop_event.set() +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + main() diff --git a/Auto_Use/windows_use/remote_connection/telegram/setup.py b/Auto_Use/windows_use/remote_connection/telegram/setup.py new file mode 100644 index 0000000..3edaba2 --- /dev/null +++ b/Auto_Use/windows_use/remote_connection/telegram/setup.py @@ -0,0 +1,203 @@ +# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# If you build on this project, please keep this header and credit +# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works. +# A small attribution goes a long way toward a healthy open-source +# community — thank you for contributing. 
+ +"""Telegram remote-connection setup driver (Windows, guided mode). + +Opens Microsoft Edge, navigates to web.telegram.org, then lets the user log +in manually. Progress is paced by a small always-on-top banner that streams +status text and has a Next button. The script blocks on user clicks via +banner.wait_for_next() — the user does the actual login (phone, country, +OTP) themselves; we just get them to the right page. +""" +import logging +import os +import threading +import time + +from Auto_Use.windows_use.controller.tool.open_app import open_on_windows +from Auto_Use.windows_use.tree.element import UIElementScanner, ELEMENT_CONFIG +from Auto_Use.windows_use.controller.service import ControllerService +from Auto_Use.windows_use.controller.key_combo.service import KeyComboService +from Auto_Use.windows_use.remote_connection.telegram.banner import StatusBanner +from Auto_Use.windows_use.remote_connection.telegram.service import ( + _API_KEY_FILE, _set_key_in_file, +) + +logger = logging.getLogger(__name__) + +TELEGRAM_WEB_URL = "web.telegram.org" +STEP_DELAY_SEC = 2 + +# Singleton guard — /api/telegram/connect spawns a fresh daemon thread on +# every POST, so a rapid double-click or polling-induced re-fire would +# otherwise launch parallel banner wizards. We let the redundant calls +# return immediately while the first one runs to completion. +_SETUP_LOCK = threading.Lock() +_SETUP_ACTIVE = False + +# Edge candidates tried in order — `open_on_windows` does fuzzy matching, but +# different Windows installs surface Edge under slightly different Start-Menu +# entries (PWA shortcut vs. "Microsoft Edge.lnk" vs. plain `msedge.exe` on +# PATH). Try the cleanest one first; fall back to broader strings. +EDGE_NAME_CANDIDATES = ("msedge", "Microsoft Edge", "edge") + + +def _find_address_bar(mapping: dict) -> str | None: + """Return the index of Edge's address bar, or None if not found. 
+ + On Edge the address bar surfaces in the UIA tree as + `name="Address and search bar", type="Edit"` — confirmed from a live + scan saved at debug/element/ui_elements_1778913911.txt:8. + """ + for idx, info in mapping.items(): + if info.get("name") == "Address and search bar" and info.get("type") == "Edit": + return idx + return None + + +def _launch_edge() -> bool: + """Try the Edge name variants in order; return True on the first success.""" + for name in EDGE_NAME_CANDIDATES: + try: + if open_on_windows(name): + return True + except Exception: + logger.warning("open_on_windows(%r) raised", name, exc_info=True) + return False + + +def _open_telegram_in_edge(banner) -> bool: + """Launch Edge and navigate it to web.telegram.org. + + Streams sub-step status to the banner so the user can see what's happening + while Edge takes focus. Returns False on any failure. + """ + banner.update("Please wait — confirming Edge is open…") + if not _launch_edge(): + logger.error("setup.py: failed to launch Microsoft Edge") + return False + # open_on_windows already sleeps a moment after launching, but the + # address bar isn't reliably populated in the UIA tree immediately — + # give Edge another beat to settle before we scan. + time.sleep(1) + + scanner = UIElementScanner(ELEMENT_CONFIG) + scanner.scan_elements() + mapping = scanner.get_elements_mapping() + time.sleep(STEP_DELAY_SEC) + + address_bar_index = _find_address_bar(mapping) + if address_bar_index is None: + logger.error("setup.py: Edge address bar not found in scan") + return False + + banner.update("Edge detected. 
def run(country_code: str = "", phone: str = "") -> bool:
    """Drive the guided Telegram-Web pairing wizard.

    Shows the banner, opens Telegram Web in Edge, walks the user through
    logging in and talking to @BotFather, then asks for the bot token. A
    non-empty token is written to the key file as TELEGRAM_BOT_TOKEN and
    the whole process is terminated with ``os._exit(0)`` so the next launch
    starts the bot with the new token.

    Only one wizard runs at a time: while one is active, further calls
    return False immediately.

    Args:
        country_code: Accepted for call-site compatibility; ignored.
        phone: Accepted for call-site compatibility; ignored.

    Returns:
        False when a wizard is already running, a ``wait_for_next()`` step
        returns a falsy value, Edge/Telegram could not be opened, or no
        token was entered. On success the function does not return because
        the process exits.
    """
    global _SETUP_ACTIVE
    with _SETUP_LOCK:
        if _SETUP_ACTIVE:
            logger.info(
                "setup.run: wizard already running — ignoring duplicate Connect"
            )
            return False
        _SETUP_ACTIVE = True

    # The banner is created inside the try so that a failure in its
    # constructor or show() still reaches the finally-block and releases
    # the singleton guard; otherwise every later Connect would be ignored.
    banner = None
    try:
        banner = StatusBanner()
        banner.show()

        banner.update("Let's get you set up with Telegram. Please click Next.")
        if not banner.wait_for_next():
            return False

        if not _open_telegram_in_edge(banner):
            banner.update("Failed to open Telegram. Close this banner and try again.")
            banner.wait_for_next(timeout=15)
            return False

        banner.update("Please log in to Telegram, then click Next")
        if not banner.wait_for_next():
            return False

        banner.update(
            "Now search for @BotFather in Telegram and open the chat. "
            "Click Next when you're there."
        )
        if not banner.wait_for_next():
            return False

        banner.update("How do you want to set up the bot?")
        choice = banner.wait_for_choice("Fresh setup", "Token already generated")

        if choice == "left":
            banner.update(
                "In @BotFather, send these one at a time: /newbot → AutoUse → "
                "a unique bot name. BotFather will reply with your token. "
                "Click Next when you have it."
            )
            if not banner.wait_for_next():
                return False

        banner.update("Paste your BotFather token below and click Save.")
        # Strip before the emptiness check: a whitespace-only entry must not
        # be persisted as an empty token followed by a restart.
        token = (banner.wait_for_input(save_label="Save") or "").strip()
        if not token:
            return False  # banner never appeared, user closed it, or blank input

        _set_key_in_file(_API_KEY_FILE, "TELEGRAM_BOT_TOKEN", token)

        banner.update("Saved. Restarting AutoUse to start the bot…")
        # Give the message a moment to be read, then hard-exit the whole
        # process. os._exit skips atexit handlers and the finally-block
        # below, so the banner is closed explicitly first.
        time.sleep(3)
        banner.close()
        os._exit(0)
    finally:
        try:
            if banner is not None:
                banner.close()
        finally:
            # Always release the guard, even if closing the banner raises.
            with _SETUP_LOCK:
                _SETUP_ACTIVE = False
def _read_token() -> str | None:
    """Return the TELEGRAM_BOT_TOKEN value stored in api_key.txt.

    Only the key file is consulted; .env and environment variables are
    deliberately not read. Returns None when the file does not exist, cannot
    be read, has no ``TELEGRAM_BOT_TOKEN=`` line, or the first such line has
    an empty value.
    """
    if not _API_KEY_FILE.exists():
        return None

    prefix = "TELEGRAM_BOT_TOKEN="
    try:
        with open(_API_KEY_FILE, "r", encoding="utf-8") as key_file:
            for raw_line in key_file:
                entry = raw_line.strip()
                if not entry.startswith(prefix):
                    continue
                # The first matching line decides the result, empty or not.
                token = entry.partition("=")[2].strip()
                return token if token else None
    except Exception:
        logger.warning("could not read %s", _API_KEY_FILE)
    return None
def _fetch_bot_username(token: str) -> str | None:
    """Look up the bot's username with one call to Telegram's getMe.

    Used by the status route so the panel can show the bot's @username
    rather than only "connected".

    Args:
        token: Bot token inserted into the getMe URL.

    Returns:
        The username from the response, or None when the request fails,
        the response is not ``ok``, or the username is missing/empty.
    """
    try:
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(
            f"https://api.telegram.org/bot{token}/getMe", timeout=5
        ) as resp:
            data = json.loads(resp.read())
        if data.get("ok"):
            return data["result"].get("username", "") or None
    except Exception as exc:
        # Best-effort lookup: never raise into the route. Log only the
        # exception type so the token-bearing URL cannot end up in logs.
        logger.debug("getMe lookup failed: %s", type(exc).__name__)
    return None
@telegram_bp.route("/api/telegram/status", methods=["GET"])
def telegram_status():
    """Report whether a bot token is stored, plus the cached bot username.

    With no token in the key file the username cache is cleared and the
    response is ``{"connected": False}``. With a token, the username is
    fetched once via ``_fetch_bot_username`` (an unsuccessful lookup is
    cached as an empty string) and returned alongside ``connected: True``.
    """
    global _bot_username_cache

    token = _read_token()
    if token:
        if _bot_username_cache is None:
            looked_up = _fetch_bot_username(token)
            _bot_username_cache = looked_up or ""
        payload = {
            "connected": True,
            "bot_username": _bot_username_cache,
        }
    else:
        _bot_username_cache = None
        payload = {"connected": False}
    return jsonify(payload)
The polling + thread already running keeps polling until the next app restart (soft + disconnect) — clean shutdown of the bot loop is a future enhancement.""" + global _bot_username_cache + _set_token("") + _bot_username_cache = None + return jsonify({"status": "disconnected"}) diff --git a/README.md b/README.md index 8e32b67..d97dd1a 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,22 @@ --- +
+ + ## ⭐ Star History + + + + + + Star History Chart + + + +
+ +--- + ## ✨ Features - **GUI automation via hybrid Accessibility + Vision** — Quartz screen capture annotated with the macOS Accessibility tree, then handed to a multimodal LLM. Not pure pixel-guessing, not pure DOM-walking — both at once. diff --git a/app.py b/app.py index 8cd1d07..0aea3c8 100644 --- a/app.py +++ b/app.py @@ -70,6 +70,11 @@ # Check if running as compiled binary (Nuitka) IS_COMPILED = getattr(sys, 'frozen', False) or '__compiled__' in dir() IS_CLI_SUBPROCESS = "--cli-mode" in sys.argv +# Any re-exec of AutoUse.exe that should NOT overwrite the parent's debug log +# or wipe the parent's scratchpad. --banner-mode pops the floating Telegram +# pill (compiled-binary path — see banner.py:_IS_COMPILED branch). Treated +# identically to --cli-mode at the bootstrap-suppression layer below. +IS_SECONDARY_PROCESS = IS_CLI_SUBPROCESS or "--banner-mode" in sys.argv def app_data_dir() -> Path: @@ -121,8 +126,9 @@ def debug_exception(context): debug_log(f"EXCEPTION in {context}:", "ERROR") debug_log(traceback.format_exc(), "ERROR") -# Initialize log file on startup (only in compiled mode, not CLI subprocess) -if IS_COMPILED and not IS_CLI_SUBPROCESS and DEBUG_LOG_PATH: +# Initialize log file on startup (only in compiled mode, not in any +# secondary subprocess — those would clobber the parent's log on every spawn) +if IS_COMPILED and not IS_SECONDARY_PROCESS and DEBUG_LOG_PATH: try: with open(DEBUG_LOG_PATH, 'w', encoding='utf-8') as f: f.write(f"=== Auto Use Debug Log - Started {datetime.now()} ===\n") @@ -133,9 +139,60 @@ def debug_exception(context): except: pass +# ============================================================================= +# Banner subprocess stdio reconnection (MUST run before the std-fixup below) +# ============================================================================= +# When AutoUse.exe is re-exec'd as a banner subprocess via --banner-mode, the +# parent's subprocess.Popen wires fd 0 (stdin) and fd 1 (stdout) to the pipes +# 
it uses to drive the wizard. But the binary is built as a Windows +# GUI-subsystem app (--windows-console-mode=disable in windows_binary_build.py) +# which means Python startup sets sys.stdin/sys.stdout to None — even though +# the OS-level fds are valid pipe handles inherited from the parent. We have +# to wrap those fds as text streams here, BEFORE the `if sys.stdout is None` +# block below silently replaces stdin/stdout with /dev/null and permanently +# severs the JSON-stdio protocol with the parent. Without this, the parent +# never sees READY/NEXT/CHOICE/SAVE/CLOSED events, the subprocess's +# _stdin_reader crashes on `for line in None`, and the entire banner wizard +# auto-completes in milliseconds when the eventual subprocess crash unblocks +# every wait_for_* event in the parent at once. (Symptom: pill flashes for a +# few seconds, Edge opens, empty token gets persisted, AutoUse restarts.) +if "--banner-mode" in sys.argv: + try: + # line_buffering on stdin doesn't really matter (we're the reader), + # but the explicit encoding stops a UTF-8/cp1252 mismatch from + # silently dropping non-ASCII wizard text. + sys.stdin = os.fdopen(0, "r", encoding="utf-8", errors="replace") + except Exception: + pass + try: + # buffering=1 → line-buffered, so each `_emit()` JSON line reaches + # the parent immediately instead of sitting in a 4 KB block buffer. + sys.stdout = os.fdopen(1, "w", encoding="utf-8", errors="replace", buffering=1) + except Exception: + pass + if sys.stderr is None: + # sys.stderr is None in a Nuitka GUI-subsystem child. pywebview's + # webview/http.py has a self-heal shim, but it only runs after + # `import webview` — anything that writes to stderr before that + # (a stray print, an uncaught traceback) would crash the + # subprocess. Try the inherited fd 2; fall back to devnull so the + # attribute is never None. 
+ try: + sys.stderr = os.fdopen(2, "w", encoding="utf-8", errors="replace", buffering=1) + except Exception: + try: + sys.stderr = open(os.devnull, "w", encoding="utf-8") + except Exception: + pass + # ============================================================================= # Fix for bundled app (MUST be before any print statements) # Skip when run from main.py / cli.py so terminal output is not buffered. +# Also skip in --banner-mode: the subprocess already wired its stdio above and +# re-wrapping orphans the original TextIOWrapper — its eventual GC closes +# fd 1 in the subprocess (silently breaking the JSON protocol with the parent +# after a few seconds), and the new wrapper also drops the line-buffering +# we deliberately set with buffering=1. # ============================================================================= def _entry_is_cli_script(): @@ -145,7 +202,7 @@ def _entry_is_cli_script(): main_file = getattr(sys.modules['__main__'], '__file__', None) or '' return os.path.basename(main_file) in ('main.py', 'cli.py') -if not _entry_is_cli_script(): +if not _entry_is_cli_script() and "--banner-mode" not in sys.argv: if sys.stdout is None: sys.stdout = open(os.devnull, 'w', encoding='utf-8') elif hasattr(sys.stdout, 'buffer'): @@ -372,8 +429,10 @@ def read_api_keys(): line = line.strip() if '=' in line: name, _, value = line.partition('=') - if name in keys: - keys[name] = value + # Keep every key, not just managed ones, so unmanaged + # entries (e.g. TELEGRAM_BOT_TOKEN, TELEGRAM_OWNER_CHAT_ID) + # survive a read-modify-write cycle instead of being dropped. + keys[name] = value except Exception: debug_exception("read_api_keys") return keys @@ -384,9 +443,15 @@ def write_api_keys(keys): try: key_file.parent.mkdir(parents=True, exist_ok=True) all_key_names = list(PROVIDER_KEY_MAP.values()) + EXTRA_KEYS + # Preserve any unmanaged keys (e.g. 
def minimize_main_window():
    """Minimise the AutoUse pywebview window if one exists.

    The window is read from the module-level ``webview_window`` through
    ``globals()``, so a missing or still-None window makes this a silent
    no-op. A failing ``minimize()`` call is recorded with
    ``debug_exception`` instead of propagating to the caller.
    """
    window = globals().get('webview_window')
    if window is not None:
        try:
            window.minimize()
        except Exception:
            debug_exception("minimize_main_window")
We + # check at the very top so even one stray scratchpad wipe / Flask + # bind can't happen. --compact is left in argv on purpose — it's + # read inside _run_subprocess_banner via `"--compact" in sys.argv`. + if "--banner-mode" in sys.argv and IS_WINDOWS: + sys.argv.remove("--banner-mode") + try: + from Auto_Use.windows_use.remote_connection.telegram.banner import ( + _run_subprocess_banner, + ) + _run_subprocess_banner() + except Exception: + debug_exception("Banner mode") + return + + # Wire the Telegram remote-control bot. Windows mounts a Flask blueprint + # plus a polling bot; macOS just starts the polling bot (no blueprint yet — + # token is read from .env / api_key.txt directly). if IS_WINDOWS: try: from Auto_Use.windows_use.remote_connection.telegram.view import telegram_bp, start_bot @@ -907,6 +1010,17 @@ def main(): start_bot() except Exception: debug_exception("telegram_blueprint_init") + elif IS_MAC: + try: + from Auto_Use.macOS_use.remote_connection.telegram.view import telegram_bp + from Auto_Use.macOS_use.remote_connection.telegram.service import start_bot as start_telegram_bot + app.register_blueprint(telegram_bp) + start_telegram_bot() + except Exception as _tg_e: + import traceback as _tg_tb + print(f"[telegram] IMPORT/INIT FAILED: {_tg_e!r}", file=sys.stderr, flush=True) + _tg_tb.print_exc(file=sys.stderr) + debug_exception("telegram_bot_init") if "--cli-mode" in sys.argv: # CLI mode - delegate to the platform-specific CLI agent @@ -988,4 +1102,4 @@ def main(): main() except Exception: debug_exception("main entry point") - raise + raise \ No newline at end of file diff --git a/frontend/css/style.css b/frontend/css/style.css index d017fbe..ea99d34 100644 --- a/frontend/css/style.css +++ b/frontend/css/style.css @@ -896,27 +896,124 @@ body { padding: clamp(20px, 3vw, 32px) 0; } -.remote-qr-container { - width: 160px; - height: 160px; +.remote-instruction { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + 
font-size: clamp(12px, 1.2vw, 14px); + color: rgba(255, 255, 255, 0.45); + text-align: center; +} + +.remote-service-btn { display: flex; align-items: center; - justify-content: center; + gap: 12px; + width: 100%; + padding: 14px 16px; + background: rgba(255, 255, 255, 0.06); + border: 1px solid rgba(255, 255, 255, 0.1); border-radius: 14px; - background: rgba(255, 255, 255, 0.08); - padding: 12px; + color: rgba(255, 255, 255, 0.92); + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: clamp(13px, 1.3vw, 15px); + font-weight: 500; + cursor: pointer; + transition: background 0.2s ease, border-color 0.2s ease; } -.remote-qr-container img, -.remote-qr-container canvas { - border-radius: 6px; +.remote-service-btn:hover { + background: rgba(255, 255, 255, 0.1); + border-color: rgba(255, 255, 255, 0.18); } -.remote-instruction { +.remote-service-icon { + display: flex; + align-items: center; + justify-content: center; + color: rgba(80, 165, 230, 0.95); +} + +.remote-service-label { + flex: 1; + text-align: left; +} + +.remote-service-chevron { + color: rgba(255, 255, 255, 0.45); + display: flex; + align-items: center; +} + +.remote-phone-form { + display: flex; + flex-direction: column; + gap: 10px; + width: 100%; + margin-top: 4px; +} + +.remote-phone-row { + display: flex; + gap: 8px; +} + +.remote-country-select, +.remote-phone-input { + background: rgba(255, 255, 255, 0.06); + border: 1px solid rgba(255, 255, 255, 0.12); + border-radius: 10px; + color: rgba(255, 255, 255, 0.95); font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; font-size: clamp(12px, 1.2vw, 14px); - color: rgba(255, 255, 255, 0.45); - text-align: center; + padding: 9px 12px; + outline: none; + transition: border-color 0.2s ease, background 0.2s ease; +} + +.remote-country-select { + flex: 0 0 42%; + appearance: none; + -webkit-appearance: none; + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + 
background-position: right 10px center; + padding-right: 28px; +} + +.remote-country-select option { + background: #1a1a1a; + color: #fff; +} + +.remote-phone-input { + flex: 1; +} + +.remote-country-select:focus, +.remote-phone-input:focus { + border-color: rgba(80, 165, 230, 0.55); + background: rgba(255, 255, 255, 0.09); +} + +.remote-connect-btn { + padding: 10px 16px; + background: rgba(80, 165, 230, 0.85); + color: #fff; + border: none; + border-radius: 10px; + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: clamp(12px, 1.2vw, 14px); + font-weight: 600; + cursor: pointer; + transition: background 0.2s ease, opacity 0.2s ease; +} + +.remote-connect-btn:hover:not(:disabled) { + background: rgba(80, 165, 230, 1); +} + +.remote-connect-btn:disabled { + opacity: 0.45; + cursor: not-allowed; } .remote-connected { @@ -964,6 +1061,18 @@ body { color: rgba(255, 100, 100, 0.95); } +/* Delete-token button — brighter, solid red so it reads clearly as an action */ +#remoteDeleteTokenBtn { + border-color: rgba(235, 70, 70, 0.9); + background: rgba(235, 70, 70, 0.9); + color: #fff; +} + +#remoteDeleteTokenBtn:hover { + background: rgba(235, 70, 70, 1); + color: #fff; +} + .settings-provider-list { display: flex; flex-direction: column; @@ -2334,3 +2443,94 @@ body.cli-mode .settings-btn { loop of funny scraping/searching phrases (handled in script.js). No CSS overlay — globe icon + streamed phrases are the cue. ============================================================ */ + +/* ============================================================ + AutoUse helper info popup + Shown once when the user clicks Connect under Remote Connection → + Telegram. Tells them to look at the top-right banner. 
+ ============================================================ */ +.telegram-prompt-overlay { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.3); + backdrop-filter: blur(8px); + -webkit-backdrop-filter: blur(8px); + z-index: 110; + + display: flex; + align-items: center; + justify-content: center; + + opacity: 0; + visibility: hidden; + transition: opacity 0.3s ease, visibility 0.3s ease; +} + +.telegram-prompt-overlay.active { + opacity: 1; + visibility: visible; +} + +.telegram-prompt-popup { + position: relative; + width: clamp(320px, 90vw, 420px); + padding: clamp(24px, 3vw, 32px); + border-radius: clamp(16px, 2vw, 24px); + + transform: scale(0.9) translateY(20px); + transition: transform 0.4s cubic-bezier(0.175, 0.885, 0.32, 1.275); + + box-shadow: 0 clamp(8px, 2vw, 16px) clamp(32px, 6vw, 64px) rgba(0, 0, 0, 0.2); +} + +.telegram-prompt-overlay.active .telegram-prompt-popup { + transform: scale(1) translateY(0); +} + +.telegram-prompt-content { + position: relative; + z-index: 3; + display: flex; + flex-direction: column; + align-items: center; + gap: clamp(14px, 1.8vw, 18px); + text-align: center; +} + +.telegram-prompt-title { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: clamp(18px, 2vw, 22px); + font-weight: 600; + color: rgba(255, 255, 255, 0.95); +} + +.telegram-prompt-message { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: clamp(13px, 1.3vw, 14px); + font-weight: 400; + color: rgba(255, 255, 255, 0.75); + line-height: 1.5; +} + +.telegram-prompt-message strong { + color: rgba(255, 255, 255, 0.95); + font-weight: 600; +} + +.telegram-prompt-ok { + margin-top: 4px; + padding: 10px 28px; + border: none; + border-radius: 10px; + background: rgba(80, 165, 230, 0.85); + color: #fff; + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + font-size: clamp(12px, 1.2vw, 14px); + font-weight: 600; + cursor: pointer; + 
transition: background 0.2s ease; +} + +.telegram-prompt-ok:hover { + background: rgba(80, 165, 230, 1); +} diff --git a/frontend/index.html b/frontend/index.html index f79890a..c46f0f0 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -373,10 +373,27 @@ Remote Connection - +
-
-
Scan with your phone to pair via Telegram
+ + + + +
@@ -392,6 +409,20 @@ + +
+
+
+
+
+
+
AutoUse helper
+
The helper will walk you through Telegram setup. Please click Next on the banner in the top-right corner of your screen to continue.
+ +
+
+
+
diff --git a/frontend/script.js b/frontend/script.js index 2f3efa6..7cf4bf2 100644 --- a/frontend/script.js +++ b/frontend/script.js @@ -605,39 +605,84 @@ document.addEventListener('DOMContentLoaded', () => { }); }); - // Remote Connection — QR + status logic + // Remote Connection — guided Telegram pairing const remoteSetup = document.getElementById('remoteSetup'); const remoteConnected = document.getElementById('remoteConnected'); - const remoteQrContainer = document.getElementById('remoteQrContainer'); const remoteBotName = document.getElementById('remoteBotName'); const remoteDisconnectBtn = document.getElementById('remoteDisconnectBtn'); + const remoteTelegramBtn = document.getElementById('remoteTelegramBtn'); + const remoteTelegramForm = document.getElementById('remoteTelegramForm'); + const remoteConnectBtn = document.getElementById('remoteConnectBtn'); + const remoteDeleteTokenBtn = document.getElementById('remoteDeleteTokenBtn'); + const remoteInstructions = document.getElementById('remoteInstructions'); + const telegramPromptOverlay = document.getElementById('telegramPromptOverlay'); + const telegramPromptOk = document.getElementById('telegramPromptOk'); function loadRemoteStatus() { fetch('/api/telegram/status') .then(res => res.json()) .then(data => { - if (data.connected && data.bot_username) { - remoteSetup.style.display = 'none'; - remoteConnected.style.display = 'flex'; - remoteBotName.textContent = '@' + data.bot_username; + // Always keep the Telegram service button visible and + // expandable. When already paired, just grey out the + // Connect button inside the form rather than swapping + // to a different panel. + remoteSetup.style.display = 'flex'; + if (remoteConnected) remoteConnected.style.display = 'none'; + + if (data.connected) { + if (remoteConnectBtn) { + remoteConnectBtn.disabled = true; + remoteConnectBtn.textContent = data.bot_username + ? 
'✓ Already paired (@' + data.bot_username + ')' + : '✓ Already paired'; + } + if (remoteDeleteTokenBtn) remoteDeleteTokenBtn.style.display = 'inline-block'; + if (remoteInstructions) remoteInstructions.style.display = 'none'; } else { - remoteSetup.style.display = 'flex'; - remoteConnected.style.display = 'none'; - const pairUrl = 'http://' + data.local_ip + ':5000/pair'; - remoteQrContainer.innerHTML = ''; - new QRCode(remoteQrContainer, { - text: pairUrl, - width: 160, - height: 160, - colorDark: '#ffffff', - colorLight: 'transparent', - correctLevel: QRCode.CorrectLevel.M - }); + if (remoteConnectBtn) { + remoteConnectBtn.disabled = false; + remoteConnectBtn.textContent = 'Connect'; + } + if (remoteDeleteTokenBtn) remoteDeleteTokenBtn.style.display = 'none'; + if (remoteTelegramForm) remoteTelegramForm.style.display = 'none'; + if (remoteInstructions) remoteInstructions.style.display = 'none'; } }) .catch(() => {}); } + if (remoteTelegramBtn) { + remoteTelegramBtn.addEventListener('click', () => { + if (!remoteTelegramForm) return; + const isHidden = remoteTelegramForm.style.display === 'none' || !remoteTelegramForm.style.display; + remoteTelegramForm.style.display = isHidden ? 
'flex' : 'none'; + }); + } + + if (remoteConnectBtn) { + remoteConnectBtn.addEventListener('click', () => { + remoteConnectBtn.disabled = true; + if (telegramPromptOverlay) telegramPromptOverlay.classList.add('active'); + fetch('/api/telegram/connect', { method: 'POST' }) + .catch(() => {}) + .finally(() => { + remoteConnectBtn.disabled = false; + if (remoteInstructions) remoteInstructions.style.display = 'block'; + }); + }); + } + + if (telegramPromptOk && telegramPromptOverlay) { + telegramPromptOk.addEventListener('click', () => { + telegramPromptOverlay.classList.remove('active'); + }); + telegramPromptOverlay.addEventListener('click', (e) => { + if (e.target === telegramPromptOverlay) { + telegramPromptOverlay.classList.remove('active'); + } + }); + } + if (remoteDisconnectBtn) { remoteDisconnectBtn.addEventListener('click', () => { fetch('/api/telegram/disconnect', { method: 'POST' }) @@ -646,6 +691,16 @@ document.addEventListener('DOMContentLoaded', () => { }); } + if (remoteDeleteTokenBtn) { + remoteDeleteTokenBtn.addEventListener('click', () => { + remoteDeleteTokenBtn.disabled = true; + fetch('/api/telegram/disconnect', { method: 'POST' }) + .then(() => loadRemoteStatus()) + .catch(() => {}) + .finally(() => { remoteDeleteTokenBtn.disabled = false; }); + }); + } + settingsOverlay.addEventListener('click', (e) => { if (e.target === settingsOverlay) { resetSettingsToMenu(); diff --git a/mac_requirements.txt b/mac_requirements.txt index caab906..903e73f 100644 --- a/mac_requirements.txt +++ b/mac_requirements.txt @@ -27,6 +27,9 @@ mss flask psutil +# Remote Connection (Telegram bot) +python-telegram-bot + # Build Tools (Nuitka binary compilation) nuitka ordered-set diff --git a/main.py b/main.py index 036c156..c0dffe8 100644 --- a/main.py +++ b/main.py @@ -28,12 +28,12 @@ raise RuntimeError(f"Unsupported OS: {platform.system()}") # Configuration -PROVIDER = "openrouter" -MODEL = "gemini-3.1-pro" #refer to the model name correctly from model_list.txt. 
+PROVIDER = "local" +MODEL = "gemma4:e4b" #refer to the model name correctly from model_list.txt. # Your task here task = """ -study the the grep glob approach of macos then syndicate to windowsus use the directory is desktop/github/Auto use. +open safari """ diff --git a/windows_requirements.txt b/windows_requirements.txt index a09ebb6..cc2f635 100644 --- a/windows_requirements.txt +++ b/windows_requirements.txt @@ -26,6 +26,7 @@ keyboard mss flask==3.1.2 psutil +python-telegram-bot>=20.0 #ocr winrt-runtime @@ -34,4 +35,7 @@ winrt-Windows.Foundation.Collections winrt-Windows.Media.Ocr winrt-Windows.Globalization winrt-Windows.Graphics.Imaging -winrt-Windows.Storage.Streams \ No newline at end of file +winrt-Windows.Storage.Streams + +# Build / Packaging +nuitka \ No newline at end of file