Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,4 @@ mac_binary_build.py
*.icns
screen_record.py
/Auto_Use/macOS_use/scratchpad
/cli_minion_result
4 changes: 2 additions & 2 deletions Auto_Use/macOS_use/agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# community — thank you for contributing.

# Auto_Use/macOS_use/agent/__init__.py
from .service import AgentService
from .view import AgentResponseFormatter
from .main_driver.service import AgentService
from .main_driver.view import AgentResponseFormatter

__all__ = ['AgentService', 'AgentResponseFormatter']
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
This module allows the CLI agent to be run as a subprocess.

Usage:
python -m Auto_Use.macOS_use.agent.cli --task "your task here"
python -m Auto_Use.macOS_use.agent.coder --task "your task here"

Options:
--task : Required. The task for CLI agent to execute
Expand All @@ -37,7 +37,7 @@
- Result is written to --result file when done

When called directly for testing:
- Run: python -m Auto_Use.macOS_use.agent.cli --task "test task"
- Run: python -m Auto_Use.macOS_use.agent.coder --task "test task"
- Or use cli.py at project root
"""

Expand All @@ -61,8 +61,8 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python -m Auto_Use.macOS_use.agent.cli --task "fix the bug in test.py"
python -m Auto_Use.macOS_use.agent.cli --task "create hello world" --provider openrouter --model gemini-3.5-flash
python -m Auto_Use.macOS_use.agent.coder --task "fix the bug in test.py"
python -m Auto_Use.macOS_use.agent.coder --task "create hello world" --provider openrouter --model gemini-3.5-flash
"""
)

Expand Down
24 changes: 24 additions & 0 deletions Auto_Use/macOS_use/agent/main_driver/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# If you build on this project, please keep this header and credit
# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
# A small attribution goes a long way toward a healthy open-source
# community — thank you for contributing.

# Auto_Use/macOS_use/agent/main_driver/__init__.py
from .service import AgentService
from .view import AgentResponseFormatter

__all__ = ['AgentService', 'AgentResponseFormatter']
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
from pathlib import Path
from datetime import datetime
from typing import Optional
from ..llm_provider.llm_manager import LLMManager
from ...llm_provider.llm_manager import LLMManager
from .view import AgentResponseFormatter
from ..tree.element import UIElementScanner, ELEMENT_CONFIG
from ..controller import ControllerView
from .domain_knowledge import DomainKnowledgeService
from ...tree.element import UIElementScanner, ELEMENT_CONFIG
from ...controller import ControllerView
from ..skills import DomainKnowledgeService
from PIL import Image
from io import BytesIO

Expand Down Expand Up @@ -111,7 +111,7 @@ def __init__(self, provider: str, model: str, save_conversation: bool = False, t
self.controller = ControllerView(provider=provider, model=self.llm_manager.get_model_name(), web_callback=web_callback, shell_callback=shell_callback, cli_callback=cli_callback, api_key=api_key, stop_event=stop_event, external_terminal=external_terminal)

# Initialize Domain Knowledge Service
self.domain_knowledge = DomainKnowledgeService()
self.skills = DomainKnowledgeService()

# Save conversation flag
self.save_conversation = save_conversation
Expand Down Expand Up @@ -384,7 +384,7 @@ def process_request(self, task: str) -> str:
formatted_element_tree = f"<element_tree>\n{element_tree_text}\n</element_tree>"

# Fetch domain-specific knowledge if available
domain_block = self.domain_knowledge.get_knowledge(
domain_block = self.skills.get_knowledge(
self.scanner.application_name,
element_tree_text
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,8 @@ Each step includes:
3. [ID] is displayed at the top-left corner of the element it belongs to.
</os_vision>
<blocks>
1. Each output must contain the following blocks.
2. These blocks build on one another as progress is made.
3. Output blocks: `thinking`, `verdict_last_action`, `decision`, `memory`, `current_goal`, and `action`.
1. Each output builds on the last; produce every block in order.
2. Blocks: `thinking`, `verdict_last_action`, `decision`, `memory`, `current_goal`, `action`.
<thinking>
1. Think before jumping to any conclusion. You must follow the <reasoning_rules> at each step.
2. Max 150 words, 3-5 sentences. No repeating, no second-guessing.
Expand Down Expand Up @@ -194,15 +193,14 @@ Each step includes:
2. Negative: `"verdict_last_action": "Based on <os_vision>: still on Home after clicking Downloads; id 100 path shows Home. <last_response>: PASS, but left_click did not register. Verdict: FAIL."`
</verdict_last_action>
<decision>
*The final synthesis of your thinking — bridge between reasoning and action.*
1. After reasoning through the screenshot and element_tree in your thinking block, distill your conclusion here in 2–3 concise lines.
2. Line 1: Focused app/window and its current state.
3. Line 2: Finalized actions (with IDs or tools).
4. Line 3: Why — the reasoning behind this decision and any recovery if applicable.
5. Format: "decision": "<App/Window>; <State>.\nFinalized: <Actions/Tools with IDs>.\nReason: <why this decision was taken + recovery if any>."
6. Examples:
1. "decision": "Safari - Gmail Compose; To/Subject/Body fields loaded.\nFinalized: input id 12 (To), input id 15 (Subject), input id 20 (Body).\nReason: All compose fields visible and aligned, filling in sequence to complete email draft."
2. "decision": "Finder; Downloads folder open with target file visible.\nFinalized: left_click 2 times on id 33.\nReason: File is fully visible and aligned, opening it to verify contents before marking todo complete."
*Commit step: lock the exact surface, ids/tools, and rationale before emitting `action`.*
1. Line 1: Active app/window + its current state.
2. Line 2: Exact ids/tools you will act on (each must exist in <element_tree>).
3. Line 3: Why this is correct; if last verdict was FAIL, state the recovery.
4. Format: "decision": "<App/Window>; <State>.\nFinalized: <Actions/Tools with IDs>.\nReason: <why + recovery if FAIL>."
5. Examples:
1. "decision": "Safari - Gmail Compose; To/Subject/Body fields loaded.\nFinalized: input id 12 (To), input id 15 (Subject), input id 20 (Body).\nReason: Fields visible and aligned, filling in sequence to complete the draft."
2. "decision": "Finder; still on Home, last Downloads click did not register.\nFinalized: left_click id 18 (Downloads, sidebar).\nReason: Verdict FAIL on toolbar item; retrying via the stable sidebar target id 18."
</decision>
<current_goal>
# Rule: align with the top pending ToDo item.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
# community — thank you for contributing.

# Minion sub-agent package.
# Mirrors the cli/ package structure:
# Mirrors the coder/ package structure:
# - service.py : full agent loop (read-only scout variant)
# - view.py : MinionResponseFormatter (next_goal-shape JSON validator)
# - __main__.py : subprocess entry — `python -m ...agent.cli.minions`
# - __main__.py : subprocess entry — `python -m ...agent.minions`
# - system_prompt.md : read-only scout system prompt

from .service import AgentService
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
Subprocess entry for the read-only scout minion.

Usage:
python -m Auto_Use.macOS_use.agent.cli.minions --task "your question here"
python -m Auto_Use.macOS_use.agent.minions --task "your question here"

Options:
--task : Required. The question/objective for the minion to answer.
Expand All @@ -38,7 +38,7 @@
parent CLI agent as a <minion_completed> tool response.

When called directly for testing:
python -m Auto_Use.macOS_use.agent.cli.minions --task "where is X defined?"
python -m Auto_Use.macOS_use.agent.minions --task "where is X defined?"
"""

import argparse
Expand All @@ -60,8 +60,8 @@ def main():
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python -m Auto_Use.macOS_use.agent.cli.minions --task "where is _read_scratchpad_from_file defined and who calls it?"
python -m Auto_Use.macOS_use.agent.cli.minions --task "list every file under src/ that imports requests"
python -m Auto_Use.macOS_use.agent.minions --task "where is _read_scratchpad_from_file defined and who calls it?"
python -m Auto_Use.macOS_use.agent.minions --task "list every file under src/ that imports requests"
"""
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
from typing import Optional
import threading

from ....llm_provider.llm_manager import LLMManager
from ....controller.view import ControllerView
from ...llm_provider.llm_manager import LLMManager
from ...controller.view import ControllerView
from .view import MinionResponseFormatter

try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@ def __init__(self):
self.browser_keywords = ["chrome", "firefox", "edge", "opera", "brave", "safari", "vivaldi", "browser"]

def _load_mappings(self) -> dict:
"""Load domain_knowledge.json mapping file"""
"""Load skills.json mapping file"""
try:
json_path = os.path.join(self.current_dir, "domain_knowledge.json")
json_path = os.path.join(self.current_dir, "skills.json")
if os.path.exists(json_path):
with open(json_path, 'r', encoding='utf-8') as f:
return json.load(f)
else:
logger.warning("domain_knowledge.json not found")
logger.warning("skills.json not found")
return {"browser": {}, "os": {}}
except Exception as e:
logger.error(f"Error loading domain_knowledge.json: {str(e)}")
logger.error(f"Error loading skills.json: {str(e)}")
return {"browser": {}, "os": {}}

def _is_browser(self, application_name: str) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions Auto_Use/macOS_use/controller/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def route_action(self, action_data):
]
else:
cli_cmd = [
sys.executable, "-m", "Auto_Use.macOS_use.agent.cli",
sys.executable, "-m", "Auto_Use.macOS_use.agent.coder",
"--task", task_description,
"--provider", self.provider,
"--model", self.model,
Expand Down Expand Up @@ -749,7 +749,7 @@ def watch_cli_result(rf=result_file):
]
else:
cli_cmd = [
sys.executable, "-m", "Auto_Use.macOS_use.agent.cli.minions",
sys.executable, "-m", "Auto_Use.macOS_use.agent.minions",
"--task", minion_query,
"--provider", self.provider,
"--model", self.model,
Expand Down
97 changes: 54 additions & 43 deletions Auto_Use/macOS_use/llm_provider/llm_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# A small attribution goes a long way toward a healthy open-source
# community — thank you for contributing.

import copy
import os
import time
from typing import Optional
Expand Down Expand Up @@ -439,55 +440,65 @@ def _initialize_provider(self):
raise ValueError(f"Unsupported provider: {self.provider}")

def send_request(self, messages: list, annotated_screenshot_base64: Optional[str] = None):
"""Send request to the selected provider"""
# Retry up to 3 times with 1 second delay
"""Send request to the selected provider with idempotent retries."""
last_error = None
for attempt in range(3):
# Providers may mutate messages in-place (e.g. wrapping the last user
# message into multimodal content blocks); deep-copy per attempt so
# those mutations cannot compound across retries.
attempt_messages = copy.deepcopy(messages)
try:
response = self.provider_instance.send_request(messages, self.model, annotated_screenshot_base64)

# Extract the assistant's response
response = self.provider_instance.send_request(
attempt_messages, self.model, annotated_screenshot_base64
)
return response['choices'][0]['message']['content']
except Exception as e:
if attempt < 2: # If not the last attempt
print(f"⚠️ API request failed (attempt {attempt + 1}/3), retrying in 1 second...")
last_error = e
if attempt < 2:
print(f"⚠️ API request failed (attempt {attempt + 1}/3): {e}")
print(" Retrying in 1 second with a fresh message copy...")
time.sleep(1)
continue
else:
# CLI agent: seamless fallback to secondary model (never die)
if self.cli_agent and hasattr(self, '_cli_fallback_model') and self._cli_fallback_model:
print(f"⚠️ CLI Agent: {self.display_name} failed after 3 attempts. Switching to fallback...")
# Resolve fallback model info (same provider, different model)
if self.provider == "openrouter":
model_info = get_openrouter_model_info(self._cli_fallback_model)
elif self.provider == "groq":
model_info = get_groq_model_info(self._cli_fallback_model)
elif self.provider == "openai":
model_info = get_openai_model_info(self._cli_fallback_model)
elif self.provider == "anthropic":
model_info = get_anthropic_model_info(self._cli_fallback_model)
elif self.provider == "google":
model_info = get_google_model_info(self._cli_fallback_model)
elif self.provider == "perplexity":
model_info = get_perplexity_model_info(self._cli_fallback_model)
else:
raise e
# Hot-swap model (provider stays the same, no re-init needed)
self.model = model_info["api_name"]
self.has_vision = model_info["vision"]
self.display_name = model_info["display_name"]
self.model_info = model_info
# Clear fallback so we don't loop forever
self._cli_fallback_model = None
print(f"✅ CLI Agent: Now using {self.display_name}")
# Retry with fallback (same messages, full history intact)
try:
response = self.provider_instance.send_request(messages, self.model, annotated_screenshot_base64)
return response['choices'][0]['message']['content']
except Exception as fallback_e:
print(f"❌ CLI Agent: Fallback {self.display_name} also failed: {fallback_e}")
raise fallback_e
else:
raise e
print(f"❌ API request failed after 3 attempts: {e}")
break

# All 3 attempts failed. CLI agent: seamless fallback to secondary model (never die)
if self.cli_agent and hasattr(self, '_cli_fallback_model') and self._cli_fallback_model:
print(f"⚠️ CLI Agent: {self.display_name} failed after 3 attempts. Switching to fallback...")
# Resolve fallback model info (same provider, different model)
if self.provider == "openrouter":
model_info = get_openrouter_model_info(self._cli_fallback_model)
elif self.provider == "groq":
model_info = get_groq_model_info(self._cli_fallback_model)
elif self.provider == "openai":
model_info = get_openai_model_info(self._cli_fallback_model)
elif self.provider == "anthropic":
model_info = get_anthropic_model_info(self._cli_fallback_model)
elif self.provider == "google":
model_info = get_google_model_info(self._cli_fallback_model)
elif self.provider == "perplexity":
model_info = get_perplexity_model_info(self._cli_fallback_model)
else:
raise last_error
# Hot-swap model (provider stays the same, no re-init needed)
self.model = model_info["api_name"]
self.has_vision = model_info["vision"]
self.display_name = model_info["display_name"]
self.model_info = model_info
# Clear fallback so we don't loop forever
self._cli_fallback_model = None
print(f"✅ CLI Agent: Now using {self.display_name}")
# Retry with fallback (fresh copy, full history intact)
try:
response = self.provider_instance.send_request(
copy.deepcopy(messages), self.model, annotated_screenshot_base64
)
return response['choices'][0]['message']['content']
except Exception as fallback_e:
print(f"❌ CLI Agent: Fallback {self.display_name} also failed: {fallback_e}")
raise fallback_e
else:
raise last_error

def get_model_name(self) -> str:
"""Get the current model short name (preserves vertex suffix for downstream routing)"""
Expand Down
14 changes: 14 additions & 0 deletions Auto_Use/macOS_use/llm_provider/openai/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@
"display_name": "GPT-5.4",
"reasoning_support": True,
"json_mode": True
},
"gpt-5.5": {
"api_name": "gpt-5.5",
"vision": True,
"display_name": "GPT-5.5",
"reasoning_support": True,
"json_mode": True
},
"gpt-5.5-pro": {
"api_name": "gpt-5.5-pro",
"vision": True,
"display_name": "GPT-5.5 Pro",
"reasoning_support": True,
"json_mode": True
}
}

Expand Down
Loading
Loading