simular-ai · alckasoc · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -39,4 +39,4 @@ jobs:
 
     - name: Run Linter
       run: |
-        black --check gui_agents tests
+        black --check gui_agents
diff --git a/README.md b/README.md
@@ -3,6 +3,12 @@
   <small>Use Computer Like a Human</small>
 </h1>
 
+<p align="center">&nbsp;
+  🌐 <a href="https://www.simular.ai/articles/agent-s3">[S3 blog]</a>&nbsp;
+  📄 <a href="https://arxiv.org/abs/2510.02250">[S3 Paper]</a>&nbsp;
+  🎥 <a href="https://www.youtube.com/watch?v=VHr0a3UBsh4">[S3 Video]</a>
+</p>
+
 <p align="center">&nbsp;
   🌐 <a href="https://www.simular.ai/articles/agent-s2-technical-review">[S2 blog]</a>&nbsp;
   📄 <a href="https://arxiv.org/abs/2504.00906">[S2 Paper (COLM 2025)]</a>&nbsp;
@@ -50,6 +56,7 @@
 </div>
 
 ## 🥳 Updates
+- [x] **2025/10/02**: Released the [Agent S3 paper](https://arxiv.org/abs/2510.02250), setting a new SOTA of **69.9%** on OSWorld, with strong performance on WindowsAgentArena and AndroidWorld!
 - [x] **2025/08/01**: Agent S2.5 is released (gui-agents v0.2.5): simpler, better, and faster! New SOTA on [OSWorld-Verified](https://os-world.github.io)!
 - [x] **2025/07/07**: The [Agent S2 paper](https://arxiv.org/abs/2504.00906) is accepted to COLM 2025! See you in Montreal!
 - [x] **2025/04/27**: The Agent S paper won the Best Paper Award 🏆 at ICLR 2025 Agentic AI for Science Workshop!
@@ -77,36 +84,13 @@ Whether you're interested in AI, automation, or contributing to cutting-edge age
 
 ## 🎯 Current Results
 
-<div align="center">
-  <table border="0" cellspacing="0" cellpadding="5">
-    <tr>
-      <th>Benchmark</th>
-      <th>Agent S2.5</th>
-      <th>Previous SOTA</th>
-    </tr>
-    <tr>
-      <td>OSWorld Verified (100 step)</td>
-      <td><b>56.0%</b></td>
-      <td>53.1%</td>
-    </tr>
-    <tr>
-      <td>OSWorld Verified (50 step)</td>
-      <td><b>54.2%</b></td>
-      <td>50.6%</td>
-    </tr>
-<!--     <tr>
-      <td>WindowsAgentArena</td>
-      <td>29.8%</td>
-      <td>19.5% (NAVI)</td>
-    </tr>
-    <tr>
-      <td>AndroidWorld</td>
-      <td>54.3%</td>
-      <td>46.8% (UI-TARS)</td>
-    </tr> -->
-  </table>
-</div>
+<p align="center">
+  <img src="images/s3_results.png" alt="Agent S3 Results" width="700"/>
+</p>
 
+On OSWorld, Agent S3 alone reaches 62.6% in the 100-step setting, already exceeding the previous state of the art of 61.4% (Claude Sonnet 4.5). With the addition of Behavior Best-of-N, performance climbs even higher to 69.9%, bringing computer-use agents to within just a few points of human-level accuracy (72%).
+
+Agent S3 also demonstrates strong zero-shot generalization. On WindowsAgentArena, accuracy rises from 50.2% using only Agent S3 to 56.6% by selecting from 3 rollouts. Similarly, on AndroidWorld, performance improves from 68.1% to 71.6%.
 
 ## 🛠️ Installation & Setup
 
@@ -117,11 +101,11 @@ Whether you're interested in AI, automation, or contributing to cutting-edge age
 
 
 ### Installation
-To install Agent S2.5 without cloning the repository, run
+To install Agent S3 without cloning the repository, run
 ```bash
 pip install gui-agents
 ```
-If you would like to test Agent S2.5 while making changes, clone the repository and install using
+If you would like to test Agent S3 while making changes, clone the repository and install using
 ```
 pip install -e .
 ```
@@ -157,7 +141,9 @@ For optimal performance, we recommend [UI-TARS-1.5-7B](https://huggingface.co/By
 
 ### CLI
 
-Run Agent S2.5 with the required parameters:
+Note: this runs Agent S3, our improved agent, without Behavior Best-of-N (bBoN).
+
+Run Agent S3 with the required parameters:
 
 ```bash
 agent_s \
@@ -196,12 +182,12 @@ The grounding width and height should match the output coordinate resolution of
 
 ### `gui_agents` SDK
 
-First, we import the necessary modules. `AgentS2_5` is the main agent class for Agent S2.5. `OSWorldACI` is our grounding agent that translates agent actions into executable python code.
+First, we import the necessary modules. `AgentS3` is the main agent class for Agent S3. `OSWorldACI` is our grounding agent that translates agent actions into executable python code.
 ```python
 import pyautogui
 import io
-from gui_agents.s2_5.agents.agent_s import AgentS2_5
-from gui_agents.s2_5.agents.grounding import OSWorldACI
+from gui_agents.s3.agents.agent_s import AgentS3
+from gui_agents.s3.agents.grounding import OSWorldACI
 
 # Load in your API keys.
 from dotenv import load_dotenv
@@ -243,7 +229,7 @@ engine_params_for_grounding = {
 }
 ```
 
-Then, we define our grounding agent and Agent S2.5.
+Then, we define our grounding agent and Agent S3.
 
 ```python
 grounding_agent = OSWorldACI(
@@ -254,7 +240,7 @@ grounding_agent = OSWorldACI(
     height=1080  # Optional: screen height
 )
 
-agent = AgentS2_5(
+agent = AgentS3(
     engine_params,
     grounding_agent,
     platform=current_platform,
@@ -282,11 +268,11 @@ info, action = agent.predict(instruction=instruction, observation=obs)
 exec(action[0])
 ```
 
-Refer to `gui_agents/s2_5/cli_app.py` for more details on how the inference loop works.
+Refer to `gui_agents/s3/cli_app.py` for more details on how the inference loop works.
 
 ### OSWorld
 
-To deploy Agent S2.5 in OSWorld, follow the [OSWorld Deployment instructions](osworld_setup/s2_5/OSWorld.md).
+To deploy Agent S3 in OSWorld, follow the [OSWorld Deployment instructions](osworld_setup/s3/OSWorld.md).
 
 ## 💬 Citations
 

diff --git a/gui_agents/s1/cli_app.py b/gui_agents/s1/cli_app.py
@@ -17,13 +17,15 @@
 # Global flag to track pause state for debugging
 paused = False
 
+
 def get_char():
     """Get a single character from stdin without pressing Enter"""
     try:
         # Import termios and tty on Unix-like systems
         if platform.system() in ["Darwin", "Linux"]:
             import termios
             import tty
+
             fd = sys.stdin.fileno()
             old_settings = termios.tcgetattr(fd)
             try:
@@ -35,29 +37,31 @@ def get_char():
         else:
             # Windows fallback
             import msvcrt
-            return msvcrt.getch().decode('utf-8', errors='ignore')
+
+            return msvcrt.getch().decode("utf-8", errors="ignore")
     except:
         return input()  # Fallback for non-terminal environments
 
+
 def signal_handler(signum, frame):
     """Handle Ctrl+C signal for debugging during agent execution"""
     global paused
-    
+
     if not paused:
         print("\n\n🔸 Agent-S Workflow Paused 🔸")
         print("=" * 50)
         print("Options:")
         print("  • Press Ctrl+C again to quit")
         print("  • Press Esc to resume workflow")
         print("=" * 50)
-        
+
         paused = True
-        
+
         while paused:
             try:
                 print("\n[PAUSED] Waiting for input... ", end="", flush=True)
                 char = get_char()
-                
+
                 if ord(char) == 3:  # Ctrl+C
                     print("\n\n🛑 Exiting Agent-S...")
                     sys.exit(0)
@@ -67,7 +71,7 @@ def signal_handler(signum, frame):
                     break
                 else:
                     print(f"\n   Unknown command: '{char}' (ord: {ord(char)})")
-                    
+
             except KeyboardInterrupt:
                 print("\n\n🛑 Exiting Agent-S...")
                 sys.exit(0)
@@ -76,6 +80,7 @@ def signal_handler(signum, frame):
         print("\n\n🛑 Exiting Agent-S...")
         sys.exit(0)
 
+
 # Set up signal handler for Ctrl+C
 signal.signal(signal.SIGINT, signal_handler)
 
@@ -175,7 +180,7 @@ def run_agent(agent: UIAgent, instruction: str):
             time.sleep(0.1)
 
         print(f"\n🔄 Step {step + 1}/15: Getting next action from agent...")
-        
+
         # Get next action code from the agent
         info, code = agent.predict(instruction=instruction, observation=obs)
 

diff --git a/gui_agents/s2/cli_app.py b/gui_agents/s2/cli_app.py
@@ -19,13 +19,15 @@
 # Global flag to track pause state for debugging
 paused = False
 
+
 def get_char():
     """Get a single character from stdin without pressing Enter"""
     try:
         # Import termios and tty on Unix-like systems
         if platform.system() in ["Darwin", "Linux"]:
             import termios
             import tty
+
             fd = sys.stdin.fileno()
             old_settings = termios.tcgetattr(fd)
             try:
@@ -37,29 +39,31 @@ def get_char():
         else:
             # Windows fallback
             import msvcrt
-            return msvcrt.getch().decode('utf-8', errors='ignore')
+
+            return msvcrt.getch().decode("utf-8", errors="ignore")
     except:
         return input()  # Fallback for non-terminal environments
 
+
 def signal_handler(signum, frame):
     """Handle Ctrl+C signal for debugging during agent execution"""
     global paused
-    
+
     if not paused:
         print("\n\n🔸 Agent-S Workflow Paused 🔸")
         print("=" * 50)
         print("Options:")
         print("  • Press Ctrl+C again to quit")
         print("  • Press Esc to resume workflow")
         print("=" * 50)
-        
+
         paused = True
-        
+
         while paused:
             try:
                 print("\n[PAUSED] Waiting for input... ", end="", flush=True)
                 char = get_char()
-                
+
                 if ord(char) == 3:  # Ctrl+C
                     print("\n\n🛑 Exiting Agent-S...")
                     sys.exit(0)
@@ -69,7 +73,7 @@ def signal_handler(signum, frame):
                     break
                 else:
                     print(f"\n   Unknown command: '{char}' (ord: {ord(char)})")
-                    
+
             except KeyboardInterrupt:
                 print("\n\n🛑 Exiting Agent-S...")
                 sys.exit(0)
@@ -78,6 +82,7 @@ def signal_handler(signum, frame):
         print("\n\n🛑 Exiting Agent-S...")
         sys.exit(0)
 
+
 # Set up signal handler for Ctrl+C
 signal.signal(signal.SIGINT, signal_handler)
 
@@ -155,7 +160,7 @@ def run_agent(agent, instruction: str, scaled_width: int, scaled_height: int):
         # Check if we're in paused state and wait
         while paused:
             time.sleep(0.1)
-            
+
         # Get screen shot using pyautogui
         screenshot = pyautogui.screenshot()
         screenshot = screenshot.resize((scaled_width, scaled_height), Image.LANCZOS)
@@ -174,7 +179,7 @@ def run_agent(agent, instruction: str, scaled_width: int, scaled_height: int):
             time.sleep(0.1)
 
         print(f"\n🔄 Step {step + 1}/15: Getting next action from agent...")
-        
+
         # Get next action code from the agent
         info, code = agent.predict(instruction=instruction, observation=obs)