auto-use · FunctionFreak · May 23, 2026 · May 23, 2026 · May 25, 2026 · May 25, 2026
@@ -112,3 +112,4 @@ mac_binary_build.py
 *.icns
 screen_record.py
 /Auto_Use/macOS_use/scratchpad
+/cli_minion_result
@@ -18,7 +18,7 @@
 # community — thank you for contributing.
 
 # Auto_Use/macOS_use/agent/__init__.py
-from .service import AgentService
-from .view import AgentResponseFormatter
+from .main_driver.service import AgentService
+from .main_driver.view import AgentResponseFormatter
 
 __all__ = ['AgentService', 'AgentResponseFormatter']
@@ -23,7 +23,7 @@
 This module allows the CLI agent to be run as a subprocess.
 
 Usage:
-    python -m Auto_Use.macOS_use.agent.cli --task "your task here"
+    python -m Auto_Use.macOS_use.agent.coder --task "your task here"
 
     Options:
         --task      : Required. The task for CLI agent to execute
@@ -37,7 +37,7 @@
     - Result is written to --result file when done
 
 When called directly for testing:
-    - Run: python -m Auto_Use.macOS_use.agent.cli --task "test task"
+    - Run: python -m Auto_Use.macOS_use.agent.coder --task "test task"
     - Or use cli.py at project root
 """
 
@@ -61,8 +61,8 @@ def main():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-    python -m Auto_Use.macOS_use.agent.cli --task "fix the bug in test.py"
-    python -m Auto_Use.macOS_use.agent.cli --task "create hello world" --provider openrouter --model gemini-3.5-flash
+    python -m Auto_Use.macOS_use.agent.coder --task "fix the bug in test.py"
+    python -m Auto_Use.macOS_use.agent.coder --task "create hello world" --provider openrouter --model gemini-3.5-flash
         """
     )
 

@@ -0,0 +1,24 @@
+# Copyright 2026 Autouse AI — https://github.com/auto-use/Auto-Use
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# If you build on this project, please keep this header and credit
+# Autouse AI (https://github.com/auto-use/Auto-Use) in forks and derivative works.
+# A small attribution goes a long way toward a healthy open-source
+# community — thank you for contributing.
+
+# Auto_Use/macOS_use/agent/main_driver/__init__.py
+from .service import AgentService
+from .view import AgentResponseFormatter
+
+__all__ = ['AgentService', 'AgentResponseFormatter']
@@ -26,11 +26,11 @@
 from pathlib import Path
 from datetime import datetime
 from typing import Optional
-from ..llm_provider.llm_manager import LLMManager
+from ...llm_provider.llm_manager import LLMManager
 from .view import AgentResponseFormatter
-from ..tree.element import UIElementScanner, ELEMENT_CONFIG
-from ..controller import ControllerView
-from .domain_knowledge import DomainKnowledgeService
+from ...tree.element import UIElementScanner, ELEMENT_CONFIG
+from ...controller import ControllerView
+from ..skills import DomainKnowledgeService
 from PIL import Image
 from io import BytesIO
 
@@ -111,7 +111,7 @@ def __init__(self, provider: str, model: str, save_conversation: bool = False, t
         self.controller = ControllerView(provider=provider, model=self.llm_manager.get_model_name(), web_callback=web_callback, shell_callback=shell_callback, cli_callback=cli_callback, api_key=api_key, stop_event=stop_event, external_terminal=external_terminal)
 
         # Initialize Domain Knowledge Service
-        self.domain_knowledge = DomainKnowledgeService()
+        self.skills = DomainKnowledgeService()
 
         # Save conversation flag
         self.save_conversation = save_conversation
@@ -384,7 +384,7 @@ def process_request(self, task: str) -> str:
                 formatted_element_tree = f"<element_tree>\n{element_tree_text}\n</element_tree>"
 
                 # Fetch domain-specific knowledge if available
-                domain_block = self.domain_knowledge.get_knowledge(
+                domain_block = self.skills.get_knowledge(
                     self.scanner.application_name,
                     element_tree_text
                 )

@@ -151,9 +151,8 @@ Each step includes:
 3. [ID] is displayed at the top-left corner of the element it belongs to.
 </os_vision>
 <blocks>  
-1. Each output must contain the following blocks.  
-2. These blocks build on one another as progress is made.  
-3. Output blocks: `thinking`, `verdict_last_action`, `decision`, `memory`, `current_goal`, and `action`.
+1. Each output builds on the last; produce every block in order.
+2. Blocks: `thinking`, `verdict_last_action`, `decision`, `memory`, `current_goal`, `action`.
 <thinking>  
 1. You have thinking capability before jumping to any conclusion. You must follow the <reasoning_rules> at each step.
 2. Max 150 words. Keep to 3-5 sentences max. No repeating, no second-guessing.
@@ -194,15 +193,14 @@ Each step includes:
   2. Negative: `"verdict_last_action": "Based on <os_vision>: still on Home after clicking Downloads; id 100 path shows Home. <last_response>: PASS, but left_click did not register. Verdict: FAIL."`
 </verdict_last_action>
 <decision>
-*The final synthesis of your thinking — bridge between reasoning and action.*
-1. After reasoning through the screenshot and element_tree in your thinking block, distill your conclusion here in 2–3 concise lines.
-2. Line 1: Focused app/window and its current state.
-3. Line 2: Finalized actions (with IDs or tools).
-4. Line 3: Why — the reasoning behind this decision and any recovery if applicable.
-5. Format: "decision": "<App/Window>; <State>.\nFinalized: <Actions/Tools with IDs>.\nReason: <why this decision was taken + recovery if any>."
-6. Examples:
-  1. "decision": "Safari - Gmail Compose; To/Subject/Body fields loaded.\nFinalized: input id 12 (To), input id 15 (Subject), input id 20 (Body).\nReason: All compose fields visible and aligned, filling in sequence to complete email draft."
-  2. "decision": "Finder; Downloads folder open with target file visible.\nFinalized: left_click 2 times on id 33.\nReason: File is fully visible and aligned, opening it to verify contents before marking todo complete."
+*Commit step: lock in the target app/window, the exact ids/tools, and the rationale before emitting `action`.*
+1. Line 1: Active app/window + its current state.
+2. Line 2: Exact ids/tools you will act on (each id must exist in <element_tree>).
+3. Line 3: Why this is correct; if last verdict was FAIL, state the recovery.
+4. Format: "decision": "<App/Window>; <State>.\nFinalized: <Actions/Tools with IDs>.\nReason: <why + recovery if FAIL>."
+5. Examples:
+  1. "decision": "Safari - Gmail Compose; To/Subject/Body fields loaded.\nFinalized: input id 12 (To), input id 15 (Subject), input id 20 (Body).\nReason: Fields visible and aligned, filling in sequence to complete the draft."
+  2. "decision": "Finder; still on Home, last Downloads click did not register.\nFinalized: left_click id 18 (Downloads, sidebar).\nReason: Verdict FAIL on toolbar item; retrying via the stable sidebar target id 18."
 </decision>
 <current_goal>
 # Rule: align with the top pending ToDo item.

@@ -18,10 +18,10 @@
 # community — thank you for contributing.
 
 # Minion sub-agent package.
-# Mirrors the cli/ package structure:
+# Mirrors the coder/ package structure:
 #   - service.py        : full agent loop (read-only scout variant)
 #   - view.py           : MinionResponseFormatter (next_goal-shape JSON validator)
-#   - __main__.py       : subprocess entry — `python -m ...agent.cli.minions`
+#   - __main__.py       : subprocess entry — `python -m ...agent.minions`
 #   - system_prompt.md  : read-only scout system prompt
 
 from .service import AgentService

@@ -23,7 +23,7 @@
 Subprocess entry for the read-only scout minion.
 
 Usage:
-    python -m Auto_Use.macOS_use.agent.cli.minions --task "your question here"
+    python -m Auto_Use.macOS_use.agent.minions --task "your question here"
 
     Options:
         --task      : Required. The question/objective for the minion to answer.
@@ -38,7 +38,7 @@
       parent CLI agent as a <minion_completed> tool response.
 
 When called directly for testing:
-    python -m Auto_Use.macOS_use.agent.cli.minions --task "where is X defined?"
+    python -m Auto_Use.macOS_use.agent.minions --task "where is X defined?"
 """
 
 import argparse
@@ -60,8 +60,8 @@ def main():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-    python -m Auto_Use.macOS_use.agent.cli.minions --task "where is _read_scratchpad_from_file defined and who calls it?"
-    python -m Auto_Use.macOS_use.agent.cli.minions --task "list every file under src/ that imports requests"
+    python -m Auto_Use.macOS_use.agent.minions --task "where is _read_scratchpad_from_file defined and who calls it?"
+    python -m Auto_Use.macOS_use.agent.minions --task "list every file under src/ that imports requests"
         """
     )
 

@@ -44,8 +44,8 @@
 from typing import Optional
 import threading
 
-from ....llm_provider.llm_manager import LLMManager
-from ....controller.view import ControllerView
+from ...llm_provider.llm_manager import LLMManager
+from ...controller.view import ControllerView
 from .view import MinionResponseFormatter
 
 try:

@@ -36,17 +36,17 @@ def __init__(self):
         self.browser_keywords = ["chrome", "firefox", "edge", "opera", "brave", "safari", "vivaldi", "browser"]
 
     def _load_mappings(self) -> dict:
-        """Load domain_knowledge.json mapping file"""
+        """Load skills.json mapping file"""
         try:
-            json_path = os.path.join(self.current_dir, "domain_knowledge.json")
+            json_path = os.path.join(self.current_dir, "skills.json")
             if os.path.exists(json_path):
                 with open(json_path, 'r', encoding='utf-8') as f:
                     return json.load(f)
             else:
-                logger.warning("domain_knowledge.json not found")
+                logger.warning("skills.json not found")
                 return {"browser": {}, "os": {}}
         except Exception as e:
-            logger.error(f"Error loading domain_knowledge.json: {str(e)}")
+            logger.error(f"Error loading skills.json: {str(e)}")
             return {"browser": {}, "os": {}}
 
     def _is_browser(self, application_name: str) -> bool:

@@ -627,7 +627,7 @@ def route_action(self, action_data):
                         ]
                     else:
                         cli_cmd = [
-                            sys.executable, "-m", "Auto_Use.macOS_use.agent.cli",
+                            sys.executable, "-m", "Auto_Use.macOS_use.agent.coder",
                             "--task", task_description,
                             "--provider", self.provider,
                             "--model", self.model,
@@ -749,7 +749,7 @@ def watch_cli_result(rf=result_file):
                         ]
                     else:
                         cli_cmd = [
-                            sys.executable, "-m", "Auto_Use.macOS_use.agent.cli.minions",
+                            sys.executable, "-m", "Auto_Use.macOS_use.agent.minions",
                             "--task", minion_query,
                             "--provider", self.provider,
                             "--model", self.model,

@@ -17,6 +17,7 @@
 # A small attribution goes a long way toward a healthy open-source
 # community — thank you for contributing.
 
+import copy
 import os
 import time
 from typing import Optional
@@ -439,55 +440,65 @@ def _initialize_provider(self):
             raise ValueError(f"Unsupported provider: {self.provider}")
 
     def send_request(self, messages: list, annotated_screenshot_base64: Optional[str] = None):
-        """Send request to the selected provider"""
-        # Retry up to 3 times with 1 second delay
+        """Send request to the selected provider, retrying up to 3 times on a fresh copy of messages."""
+        last_error = None
         for attempt in range(3):
+            # Providers may mutate messages in-place (e.g. wrapping the last user
+            # message into multimodal content blocks); deep-copy per attempt so
+            # those mutations cannot compound across retries.
+            attempt_messages = copy.deepcopy(messages)
             try:
-                response = self.provider_instance.send_request(messages, self.model, annotated_screenshot_base64)
-
-                # Extract the assistant's response
+                response = self.provider_instance.send_request(
+                    attempt_messages, self.model, annotated_screenshot_base64
+                )
                 return response['choices'][0]['message']['content']
             except Exception as e:
-                if attempt < 2:  # If not the last attempt
-                    print(f"⚠️ API request failed (attempt {attempt + 1}/3), retrying in 1 second...")
+                last_error = e
+                if attempt < 2:
+                    print(f"⚠️ API request failed (attempt {attempt + 1}/3): {e}")
+                    print("   Retrying in 1 second with a fresh message copy...")
                     time.sleep(1)
                     continue
-                else:
-                    # CLI agent: seamless fallback to secondary model (never die)
-                    if self.cli_agent and hasattr(self, '_cli_fallback_model') and self._cli_fallback_model:
-                        print(f"⚠️ CLI Agent: {self.display_name} failed after 3 attempts. Switching to fallback...")
-                        # Resolve fallback model info (same provider, different model)
-                        if self.provider == "openrouter":
-                            model_info = get_openrouter_model_info(self._cli_fallback_model)
-                        elif self.provider == "groq":
-                            model_info = get_groq_model_info(self._cli_fallback_model)
-                        elif self.provider == "openai":
-                            model_info = get_openai_model_info(self._cli_fallback_model)
-                        elif self.provider == "anthropic":
-                            model_info = get_anthropic_model_info(self._cli_fallback_model)
-                        elif self.provider == "google":
-                            model_info = get_google_model_info(self._cli_fallback_model)
-                        elif self.provider == "perplexity":
-                            model_info = get_perplexity_model_info(self._cli_fallback_model)
-                        else:
-                            raise e
-                        # Hot-swap model (provider stays the same, no re-init needed)
-                        self.model = model_info["api_name"]
-                        self.has_vision = model_info["vision"]
-                        self.display_name = model_info["display_name"]
-                        self.model_info = model_info
-                        # Clear fallback so we don't loop forever
-                        self._cli_fallback_model = None
-                        print(f"✅ CLI Agent: Now using {self.display_name}")
-                        # Retry with fallback (same messages, full history intact)
-                        try:
-                            response = self.provider_instance.send_request(messages, self.model, annotated_screenshot_base64)
-                            return response['choices'][0]['message']['content']
-                        except Exception as fallback_e:
-                            print(f"❌ CLI Agent: Fallback {self.display_name} also failed: {fallback_e}")
-                            raise fallback_e
-                    else:
-                        raise e
+                print(f"❌ API request failed after 3 attempts: {e}")
+                break
+
+        # All 3 attempts failed. CLI agent only: switch once to the secondary model; otherwise (or if it also fails) re-raise.
+        if self.cli_agent and hasattr(self, '_cli_fallback_model') and self._cli_fallback_model:
+            print(f"⚠️ CLI Agent: {self.display_name} failed after 3 attempts. Switching to fallback...")
+            # Resolve fallback model info (same provider, different model)
+            if self.provider == "openrouter":
+                model_info = get_openrouter_model_info(self._cli_fallback_model)
+            elif self.provider == "groq":
+                model_info = get_groq_model_info(self._cli_fallback_model)
+            elif self.provider == "openai":
+                model_info = get_openai_model_info(self._cli_fallback_model)
+            elif self.provider == "anthropic":
+                model_info = get_anthropic_model_info(self._cli_fallback_model)
+            elif self.provider == "google":
+                model_info = get_google_model_info(self._cli_fallback_model)
+            elif self.provider == "perplexity":
+                model_info = get_perplexity_model_info(self._cli_fallback_model)
+            else:
+                raise last_error
+            # Hot-swap model (provider stays the same, no re-init needed)
+            self.model = model_info["api_name"]
+            self.has_vision = model_info["vision"]
+            self.display_name = model_info["display_name"]
+            self.model_info = model_info
+            # Clear fallback so we don't loop forever
+            self._cli_fallback_model = None
+            print(f"✅ CLI Agent: Now using {self.display_name}")
+            # Retry with fallback (fresh copy, full history intact)
+            try:
+                response = self.provider_instance.send_request(
+                    copy.deepcopy(messages), self.model, annotated_screenshot_base64
+                )
+                return response['choices'][0]['message']['content']
+            except Exception as fallback_e:
+                print(f"❌ CLI Agent: Fallback {self.display_name} also failed: {fallback_e}")
+                raise fallback_e
+        else:
+            raise last_error
 
     def get_model_name(self) -> str:
         """Get the current model short name (preserves vertex suffix for downstream routing)"""

@@ -34,6 +34,20 @@
         "display_name": "GPT-5.4",
         "reasoning_support": True,
         "json_mode": True
+    },
+    "gpt-5.5": {
+        "api_name": "gpt-5.5",
+        "vision": True,
+        "display_name": "GPT-5.5",
+        "reasoning_support": True,
+        "json_mode": True
+    },
+    "gpt-5.5-pro": {
+        "api_name": "gpt-5.5-pro",
+        "vision": True,
+        "display_name": "GPT-5.5 Pro",
+        "reasoning_support": True,
+        "json_mode": True
     }
 }