From 89c5b368c840dc5fb4160139bbfb5bee26b6b348 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 8 Feb 2025 21:46:40 +0100 Subject: [PATCH 01/37] Update operate.py Added wait function --- operate/operate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index c63d9851..e70aa605 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -137,7 +137,7 @@ def operate(operations, model): for operation in operations: if config.verbose: print("[Self Operating Computer][operate] operation", operation) - # wait one second + # wait one second before processing each operation time.sleep(1) operate_type = operation.get("operation").lower() operate_thought = operation.get("thought") @@ -158,17 +158,19 @@ def operate(operations, model): y = operation.get("y") click_detail = {"x": x, "y": y} operate_detail = click_detail - operating_system.mouse(click_detail) elif operate_type == "done": summary = operation.get("summary") - print( f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" ) print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True - + elif operate_type == "wait" or operate_type == "none": + print( + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for 5 seconds...{ANSI_RESET}]" + ) + time.sleep(5) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From 1760d989ac220109de8c26546ee46c35d23eeb33 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 9 Feb 2025 12:21:37 +0100 Subject: [PATCH 02/37] Update operate.py I uploaded a functional code to wait if the screen isnt yet loaded and it works perfectly. --- operate/operate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index e70aa605..b052301a 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -167,10 +167,12 @@ def operate(operations, model): print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True elif operate_type == "wait" or operate_type == "none": + duration = operation.get("duration", 5) # Default to 5 seconds if not specified print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for 5 seconds...{ANSI_RESET}]" + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for {duration} seconds...{ANSI_RESET}]" ) - time.sleep(5) + time.sleep(duration) + operate_detail = f"waiting {duration}s" else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From bd294ad2df80c30e8e996a34e444fac34c6318de Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Fri, 28 Feb 2025 23:39:33 +0100 Subject: [PATCH 03/37] Update prompts.py Added wait operation to the prompt so if the page isn't loaded yet it uses operate.py "wait" operation and Waits for 5 seconds. --- operate/models/prompts.py | 44 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 0a7e0ad1..14efbad0 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,7 +13,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. 
The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` @@ -34,6 +34,10 @@ ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -57,6 +61,14 @@ ] ``` +Example 3: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -71,7 +83,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` @@ -90,6 +102,12 @@ ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` + +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -119,6 +137,14 @@ ] ``` +Example 4: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -134,7 +160,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. 
click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` @@ -153,6 +179,11 @@ [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -183,6 +214,13 @@ {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} ] ``` +Example 4: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` A few important notes: From 803cdc83dafd5af732f9400246b85246996e6d11 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Fri, 28 Feb 2025 23:54:42 +0100 Subject: [PATCH 04/37] Update prompts.py Corrected and finished adding wait operation to prompt. --- operate/models/prompts.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 14efbad0..8c47788e 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -36,7 +36,7 @@ ``` 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -64,7 +64,7 @@ Example 3: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "waiting 5 seconds"}}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -105,7 +105,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. 
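Because the prompt examples return `duration` sometimes as a bare number and sometimes as a string, the executor side benefits from coercing the value defensively before sleeping. The helper below is only a minimal sketch of that idea, not code from these patches; the function name `handle_wait_operation` and the 30-second cap are assumptions added for illustration.

```
import time


def handle_wait_operation(operation, default_seconds=5.0, max_seconds=30.0):
    """Coerce a model-supplied wait duration into a safe float and sleep.

    The "duration" key and the 5-second default mirror the prompt examples
    above; the upper bound is an extra assumption added for safety.
    """
    raw = operation.get("duration", default_seconds)
    try:
        seconds = float(raw)  # the model may return 5, "5", or "5.0"
    except (TypeError, ValueError):
        seconds = default_seconds  # fall back rather than crash on odd JSON
    seconds = max(0.0, min(seconds, max_seconds))
    time.sleep(seconds)
    return seconds
```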
@@ -140,7 +140,7 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -181,7 +181,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -217,7 +217,7 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -234,14 +234,14 @@ """ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. Action:""" OPERATE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait Action:""" From 8ac908d35725a367db39a5abb951f79a7299629e Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 00:24:13 +0100 Subject: [PATCH 05/37] Update prompts.py Made the prompt more coherent --- operate/models/prompts.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 8c47788e..eab7ebdc 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -36,7 +36,7 @@ ``` 5. 
wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -64,8 +64,8 @@ Example 3: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "waiting 5 seconds"}}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` @@ -105,7 +105,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -140,8 +140,8 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` @@ -181,7 +181,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -217,8 +217,8 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 
0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` From 983288501da32387c7b8d23b6f1458c31847007d Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:29:51 +0100 Subject: [PATCH 06/37] Update README.md new model (Claude 3.7) explanation --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ab24691c..cb92f29f 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ ome ## Key Features - **Compatibility**: Designed for various multimodal models. -- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3 and LLaVa.** +- **Integration**: Currently integrated with **GPT-4o, o1, Claude 3.7, Gemini Pro Vision, Claude 3, qwuen-VL and LLaVa.** - **Future Plans**: Support for additional models. ## Demo @@ -62,6 +62,14 @@ operate -m o1-with-ocr ### Multimodal Models `-m` + +#### Try claude 3.7 `-m claude-3.7` +Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Antheopic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. + +``` +operate -m claude-3.7 +``` + Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model ``` operate -m gemini-pro-vision @@ -76,6 +84,13 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a operate -m claude-3 ``` +#### Try qwen `-m qwen-vl` +Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it. + +``` +operate -m qwen-vl +``` + #### Try LLaVa Hosted Through Ollama `-m llava` If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama! *Note: Ollama currently only supports MacOS and Linux. 
Windows now in Preview* From 64929e238b8d890473a699492fead4dd243a7c3a Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:37:53 +0100 Subject: [PATCH 07/37] Update config.py implementing Claude 3.7 and Qwen-VL API KEY request --- operate/config.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index b97b20ac..ca8f16d6 100644 --- a/operate/config.py +++ b/operate/config.py @@ -44,6 +44,10 @@ def __init__(self): None # instance variables are backups in case saving to a `.env` fails ) + self.qwen_api_key = ( + None # instance variables are backups in case saving to a `.env` fails + ) + def initialize_openai(self): if self.verbose: print("[Config][initialize_openai]") @@ -66,6 +70,29 @@ def initialize_openai(self): client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) return client + def initialize_qwen(self): + if self.verbose: + print("[Config][initialize_qwen]") + + if self.qwen_api_key: + if self.verbose: + print("[Config][initialize_qwen] using cached qwen_api_key") + api_key = self.qwen_api_key + else: + if self.verbose: + print( + "[Config][initialize_qwen] no cached qwen_api_key, try to get from env." + ) + api_key = os.getenv("QWEN_API_KEY") + + client = OpenAI( + api_key=api_key, + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + ) + client.api_key = api_key + client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + return client + def initialize_google(self): if self.google_api_key: if self.verbose: @@ -119,8 +146,9 @@ def validation(self, model, voice_mode): "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" ) self.require_api_key( - "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" + "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) @@ -147,6 +175,8 @@ def prompt_and_save_api_key(self, key_name, key_description): self.google_api_key = key_value elif key_name == "ANTHROPIC_API_KEY": self.anthropic_api_key = key_value + elif key_name == "QWEN_API_KEY": + self.qwen_api_key = key_value self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables # Update the instance attribute with the new key From e9fdb3a6756011aa7b8362023e61ed9276ded44b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:58:10 +0100 Subject: [PATCH 08/37] Update config.py few adjustments --- operate/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/operate/config.py b/operate/config.py index ca8f16d6..c3e40060 100644 --- a/operate/config.py +++ b/operate/config.py @@ -124,8 +124,14 @@ def initialize_ollama(self): def initialize_anthropic(self): if self.anthropic_api_key: + if self.verbose: + print("[Config][initialize_anthropic] using cached anthropic_api_key") api_key = self.anthropic_api_key else: + if self.verbose: + print( + "[Config][initialize_anthropic] no cached google_api_key, try to get from env." 
+ ) api_key = os.getenv("ANTHROPIC_API_KEY") return anthropic.Anthropic(api_key=api_key) From 2d1c6ad9b3eb4692643935bfb184aedb58fd1270 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:07:58 +0100 Subject: [PATCH 09/37] Update apis.py added Claude 3.7 and Qwen-VL call functions --- operate/models/apis.py | 228 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/operate/models/apis.py b/operate/models/apis.py index d0ccb0c4..65c2ab16 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -25,6 +25,7 @@ ) from operate.utils.ocr import get_text_coordinates, get_text_element from operate.utils.screenshot import capture_screen_with_cursor +from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET # Load configuration @@ -37,6 +38,11 @@ async def get_next_action(model, messages, objective, session_id): print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4o(messages), None + if model == "Claude-3.7": + return call_claude_3_7(messages), None + if model == "qwen-vl": + operation = await call_qwen_vl_with_ocr(messages, objective, model) + return operation, None if model == "gpt-4-with-som": operation = await call_gpt_4o_labeled(messages, objective, model) return operation, None @@ -135,6 +141,228 @@ def call_gpt_4o(messages): traceback.print_exc() return call_gpt_4o(messages) +def call_claude_37(messages): + if config.verbose: + print("[call_claude_37]") + time.sleep(1) + + # We'll need to import Anthropic's client library + import anthropic + + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + # Determine which prompt to use + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + if config.verbose: + print( + "[call_claude_37] user_prompt", + user_prompt, + ) + + # Initialize Anthropic client + # You'll need to configure this in your config module + client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) + + # Convert previous messages to Anthropic format if needed + anthropic_messages = [] + for msg in messages[:-1]: # Skip the last message as we'll handle it specially + if msg["role"] == "system": + # System messages are handled differently in Anthropic API + system_content = msg["content"] + else: + anthropic_messages.append({ + "role": msg["role"], + "content": msg["content"] + }) + + # Create vision message for Claude + # Claude uses a different format for media than OpenAI + vision_message = { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt}, + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": img_base64 + } + } + ] + } + + # Add the vision message to our anthropic messages + anthropic_messages.append(vision_message) + + # Create the message request + response = client.messages.create( + model="claude-3-7-sonnet-20250219", # Claude 3.7 Sonnet model ID + messages=anthropic_messages, + system=system_content if 'system_content' 
in locals() else None, + max_tokens=2048, + ) + + # Extract the content from the response + content = response.content[0].text + content = clean_json(content) + + # Create assistant message + assistant_message = {"role": "assistant", "content": content} + + if config.verbose: + print( + "[call_claude_37] content", + content, + ) + + content = json.loads(content) + messages.append(assistant_message) + return content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}", + e, + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", + content if 'content' in locals() else "No content received", + ) + if config.verbose: + traceback.print_exc() + return call_claude_37(messages) + +async def call_qwen_vl_with_ocr(messages, objective, model): + if config.verbose: + print("[call_qwen_vl_with_ocr]") + + # Construct the path to the file within the package + try: + time.sleep(1) + client = config.initialize_qwen() + + confirm_system_prompt(messages, objective, model) + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + # Call the function to capture the screen with the cursor + raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") + capture_screen_with_cursor(raw_screenshot_filename) + + # Compress screenshot image to make size be smaller + screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg") + compress_screenshot(raw_screenshot_filename, screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + vision_message = { + "role": "user", + "content": [ + {"type": "text", + "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + messages.append(vision_message) + + response = client.chat.completions.create( + model="qwen2.5-vl-72b-instruct", + messages=messages, + ) + + content = response.choices[0].message.content + + content = clean_json(content) + + # used later for the messages + content_str = content + + content = json.loads(content) + + processed_content = [] + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_to_click", + text_to_click, + ) + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + text_element_index = get_text_element( + result, text_to_click, screenshot_filename + ) + coordinates = get_text_coordinates( + result, text_element_index, screenshot_filename + ) + + # add `coordinates`` to `content` + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] + + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_element_index", + text_element_index, + ) + print( + "[call_qwen_vl_with_ocr][click] coordinates", + coordinates, + ) + print( + "[call_qwen_vl_with_ocr][click] final operation", + operation, + ) + processed_content.append(operation) + + else: + processed_content.append(operation) + + # wait to append the assistant message so that if the `processed_content` 
step fails we don't append a message and mess up message history + assistant_message = {"role": "assistant", "content": content_str} + messages.append(assistant_message) + + return processed_content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" + ) + if config.verbose: + print("[Self-Operating Computer][Operate] error", e) + traceback.print_exc() + return gpt_4_fallback(messages, objective, model) + def call_gemini_pro_vision(messages, objective): """ From f1101af0adf63e3abbf8b48d410832a0f374d695 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:12:00 +0100 Subject: [PATCH 10/37] Update prompts.py No need to make changes for Claude 3.7 model Prompt selection but instead implemented Qwen-VL model Prompt selection --- operate/models/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index eab7ebdc..f8ddad18 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -270,7 +270,7 @@ def get_system_prompt(model, objective): os_search_str=os_search_str, operating_system=operating_system, ) - elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3": + elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": prompt = SYSTEM_PROMPT_OCR.format( objective=objective, From 4a6744b5d0318e1ba5d16b8fdb6372d6fb9aa18f Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:15:00 +0100 Subject: [PATCH 11/37] Update screenshot.py no need to make changes for Claude 3.7 screenshot function but added compressed screenshot function for Qwen-VL instead --- operate/utils/screenshot.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 597911ad..23d492f1 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -25,3 +25,18 @@ def capture_screen_with_cursor(file_path): subprocess.run(["screencapture", "-C", file_path]) else: print(f"The platform you're using ({user_platform}) is not currently supported") + + +def compress_screenshot(raw_screenshot_filename, screenshot_filename): + with Image.open(raw_screenshot_filename) as img: + # Check if the image has an alpha channel (transparency) + if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): + # Create a white background image + background = Image.new('RGB', img.size, (255, 255, 255)) + # Paste the image onto the background, using the alpha channel as mask + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + # Save the result as JPEG + background.save(screenshot_filename, 'JPEG', quality=85) # Adjust quality as needed + else: + # If no alpha channel, simply convert and save + img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85) From 980e4ee09db32d9dc9c7c31e090595b63afccce5 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:20:29 +0100 Subject: [PATCH 12/37] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dbb2cf18..5c9e7013 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="self-operating-computer", - version="1.5.7", + version="1.5.9", packages=find_packages(), 
install_requires=required, # Add dependencies here entry_points={ From cdb67afe4ee2b0e9cac718cad938903a9a1559e4 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:32:44 +0100 Subject: [PATCH 13/37] Update config.py intendation correction --- operate/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index c3e40060..a02129d5 100644 --- a/operate/config.py +++ b/operate/config.py @@ -133,7 +133,8 @@ def initialize_anthropic(self): "[Config][initialize_anthropic] no cached google_api_key, try to get from env." ) api_key = os.getenv("ANTHROPIC_API_KEY") - return anthropic.Anthropic(api_key=api_key) + + return anthropic.Anthropic(api_key=api_key) def validation(self, model, voice_mode): """ From eac3aeec7ffeae75b3b1d5f0211f3c3d7ce982cf Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:37:01 +0100 Subject: [PATCH 14/37] Update config.py intendation fix --- operate/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operate/config.py b/operate/config.py index a02129d5..985f3390 100644 --- a/operate/config.py +++ b/operate/config.py @@ -128,13 +128,13 @@ def initialize_anthropic(self): print("[Config][initialize_anthropic] using cached anthropic_api_key") api_key = self.anthropic_api_key else: - if self.verbose: + if self.verbose: print( - "[Config][initialize_anthropic] no cached google_api_key, try to get from env." + "[Config][initialize_anthropic] no cached anthropic_api_key, try to get from env." ) api_key = os.getenv("ANTHROPIC_API_KEY") - - return anthropic.Anthropic(api_key=api_key) + + return anthropic.Anthropic(api_key=api_key) def validation(self, model, voice_mode): """ From 24ef0d9ec4710adf16777dc3f1cb57d5133c6ca7 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:46:14 +0100 Subject: [PATCH 15/37] Update config.py --- operate/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/operate/config.py b/operate/config.py index 985f3390..bba6b155 100644 --- a/operate/config.py +++ b/operate/config.py @@ -153,9 +153,10 @@ def validation(self, model, voice_mode): "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" ) self.require_api_key( - "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" - ) - self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") + "ANTHROPIC_API_KEY", "Anthropic API key", + model == "claude-3" or model == "claude-3-7-sonnet-20250219" + ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) From 6277aa65e8b1df2936888bdc00231a4ae1769f36 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:52:54 +0100 Subject: [PATCH 16/37] Update config.py --- operate/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index bba6b155..d06be2d9 100644 --- a/operate/config.py +++ b/operate/config.py @@ -154,7 +154,7 @@ def validation(self, model, voice_mode): ) self.require_api_key( "ANTHROPIC_API_KEY", "Anthropic API key", - model == "claude-3" or model == "claude-3-7-sonnet-20250219" + model == "claude-3" or model == "claude-3.7" ) 
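The `QWEN_API_KEY` checked on the next line feeds `initialize_qwen()` from PATCH 07, which simply points the OpenAI SDK at DashScope's OpenAI-compatible endpoint. A minimal stand-alone sketch of that call path follows; the screenshot path and prompt text are illustrative placeholders, and the model name is the one used later in `call_qwen_vl_with_ocr`.

```
import base64
import os

from openai import OpenAI

# Same endpoint and key source as Config.initialize_qwen() in this series.
client = OpenAI(
    api_key=os.getenv("QWEN_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

with open("screenshots/screenshot.jpeg", "rb") as img_file:  # illustrative path
    img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

response = client.chat.completions.create(
    model="qwen2.5-vl-72b-instruct",  # model name used in call_qwen_vl_with_ocr
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what is on this screen."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```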
self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") From 5c09de0d3f4bb5ac079146866079201b05e7adc3 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:55:06 +0100 Subject: [PATCH 17/37] Update apis.py added coherence to the code --- operate/models/apis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 65c2ab16..7e2706b8 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -38,8 +38,8 @@ async def get_next_action(model, messages, objective, session_id): print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4o(messages), None - if model == "Claude-3.7": - return call_claude_3_7(messages), None + if model == "claude-3.7": + return call_claude_37(messages), None if model == "qwen-vl": operation = await call_qwen_vl_with_ocr(messages, objective, model) return operation, None From 2aa8aa464ac8d21fb990094c631e0cb64705e4b8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:02:58 +0100 Subject: [PATCH 18/37] Update apis.py --- operate/models/apis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 7e2706b8..a95ecc29 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -174,7 +174,7 @@ def call_claude_37(messages): # Initialize Anthropic client # You'll need to configure this in your config module - client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) + client = anthropic.Anthropic(api_key=config.anthropic_api_key if config.anthropic_api_key else os.getenv("ANTHROPIC_API_KEY")) # Convert previous messages to Anthropic format if needed anthropic_messages = [] From 9b23a09be2694c88d821bc5500caf9ff4e77669c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:07:19 +0100 Subject: [PATCH 19/37] Update apis.py --- operate/models/apis.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index a95ecc29..f8cda256 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -173,9 +173,7 @@ def call_claude_37(messages): ) # Initialize Anthropic client - # You'll need to configure this in your config module - client = anthropic.Anthropic(api_key=config.anthropic_api_key if config.anthropic_api_key else os.getenv("ANTHROPIC_API_KEY")) - + client = config.initialize_anthropic() # Convert previous messages to Anthropic format if needed anthropic_messages = [] for msg in messages[:-1]: # Skip the last message as we'll handle it specially From 20fdacedf30fe9724395789165546567ca29273c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:12:34 +0100 Subject: [PATCH 20/37] Update config.py --- operate/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/config.py b/operate/config.py index d06be2d9..6d72cb13 100644 --- a/operate/config.py +++ b/operate/config.py @@ -155,8 +155,8 @@ def validation(self, model, voice_mode): self.require_api_key( "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" - ) - self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") + ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, 
key_description, is_required): key_exists = bool(os.environ.get(key_name)) From e703cb0145d7588809231783dd9c47d849759282 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:34:31 +0100 Subject: [PATCH 21/37] Update operate.py change sting to integer in wait operation seconds --- operate/operate.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index b052301a..02c96e9f 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -166,13 +166,12 @@ def operate(operations, model): ) print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True - elif operate_type == "wait" or operate_type == "none": - duration = operation.get("duration", 5) # Default to 5 seconds if not specified - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for {duration} seconds...{ANSI_RESET}]" - ) + elif operate_type == "wait": + duration = operation["duration"] + if isinstance(duration, str): + duration = float(duration) # Convert string to float/integer + print(f"{ANSI_GREEN}[Self-Operating Computer | Waiting for {duration} seconds...]{ANSI_RESET}") time.sleep(duration) - operate_detail = f"waiting {duration}s" else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From 34c7e21260e38ee1aef1aac729a929b47c76cfc8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:36:34 +0100 Subject: [PATCH 22/37] Update apis.py changed call_claude_37 function to correctly handle requests --- operate/models/apis.py | 112 +++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 39 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index f8cda256..22afd718 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -141,53 +141,76 @@ def call_gpt_4o(messages): traceback.print_exc() return call_gpt_4o(messages) + def call_claude_37(messages): if config.verbose: print("[call_claude_37]") time.sleep(1) - - # We'll need to import Anthropic's client library + + # Import the anthropic module inside the function to ensure it's available import anthropic - + try: screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): os.makedirs(screenshots_dir) screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor capture_screen_with_cursor(screenshot_filename) - - with open(screenshot_filename, "rb") as img_file: + + # Convert PNG to JPEG format to ensure compatibility + img = Image.open(screenshot_filename) + if img.mode in ('RGBA', 'LA'): + # Remove alpha channel for JPEG compatibility + background = Image.new("RGB", img.size, (255, 255, 255)) + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + img = background + + # Save as JPEG + jpeg_filename = os.path.join(screenshots_dir, "screenshot.jpg") + img.save(jpeg_filename, "JPEG", quality=95) + + with open(jpeg_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - + # Determine which prompt to use if len(messages) == 1: user_prompt = get_user_first_message_prompt() else: user_prompt = get_user_prompt() - + if config.verbose: - print( - "[call_claude_37] user_prompt", - user_prompt, - ) - - # Initialize Anthropic client - client = config.initialize_anthropic() - # Convert previous messages to Anthropic format if needed 
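Setting the surrounding retry and message-bookkeeping logic aside, the request that `call_claude_37` ultimately issues against Anthropic's Messages API has roughly the shape sketched below. The system prompt, screenshot path, and question text are illustrative placeholders; the model ID and the base64 image block mirror the patched code.

```
import base64
import os

import anthropic

client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

with open("screenshots/screenshot.jpg", "rb") as img_file:  # illustrative path
    img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

response = client.messages.create(
    model="claude-3-7-sonnet-20250219",  # same model ID the patch uses
    max_tokens=2048,
    system="You are operating a computer; reply with a JSON array of operations.",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What should the next operation be?"},
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": img_base64,
                    },
                },
            ],
        }
    ],
)
print(response.content[0].text)  # Claude's JSON array of operations
```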
+ print("[call_claude_37] user_prompt", user_prompt) + + # Initialize Anthropic client directly with the environment variable + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + api_key = config.anthropic_api_key # Fallback to instance variable + + if config.verbose: + print("[call_claude_37] Using Anthropic API key (masked):", "*" * len(api_key) if api_key else "None") + + client = anthropic.Anthropic(api_key=api_key) + + # Extract system message + system_content = None + if messages and messages[0]["role"] == "system": + system_content = messages[0]["content"] + user_messages = messages[1:-1] if len(messages) > 1 else [] # Skip system message and last message + else: + user_messages = messages[:-1] if messages else [] # No system message, include all but last + + # Convert previous messages to Anthropic format anthropic_messages = [] - for msg in messages[:-1]: # Skip the last message as we'll handle it specially - if msg["role"] == "system": - # System messages are handled differently in Anthropic API - system_content = msg["content"] - else: + for msg in user_messages: + if msg["role"] in ["user", "assistant"]: # Only include user and assistant messages anthropic_messages.append({ "role": msg["role"], "content": msg["content"] }) - + # Create vision message for Claude - # Claude uses a different format for media than OpenAI vision_message = { "role": "user", "content": [ @@ -202,47 +225,58 @@ def call_claude_37(messages): } ] } - - # Add the vision message to our anthropic messages + + # Add the vision message anthropic_messages.append(vision_message) - + + if config.verbose: + print("[call_claude_37] System content length:", len(system_content) if system_content else 0) + print("[call_claude_37] Number of messages:", len(anthropic_messages)) + # Create the message request response = client.messages.create( - model="claude-3-7-sonnet-20250219", # Claude 3.7 Sonnet model ID + model="claude-3-7-sonnet-20250219", messages=anthropic_messages, - system=system_content if 'system_content' in locals() else None, + system=system_content, max_tokens=2048, ) - + # Extract the content from the response content = response.content[0].text content = clean_json(content) - + # Create assistant message assistant_message = {"role": "assistant", "content": content} - + if config.verbose: - print( - "[call_claude_37] content", - content, - ) - + print("[call_claude_37] content", content) + content = json.loads(content) messages.append(assistant_message) return content - + except Exception as e: + error_msg = str(e) print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying again {ANSI_RESET}", - e, + error_msg, ) + + # Define content_str before using it to avoid the "referenced before assignment" error + content_str = "No content received" + if 'content' in locals(): + content_str = content + print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", - content if 'content' in locals() else "No content received", + content_str, ) + if config.verbose: traceback.print_exc() - return call_claude_37(messages) + + # Fall back to GPT-4o + return call_gpt_4o(messages) async def call_qwen_vl_with_ocr(messages, objective, model): if config.verbose: From 969cd07b2f3c95fd3e1cb607b550bfd23b8d5545 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:13:25 +0100 Subject: [PATCH 23/37] Update operate.py added a scaling factor multiplier and divider, added a double click operation, left a pair of ideas behind but commented them in case somebody needs them. --- operate/operate.py | 431 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 377 insertions(+), 54 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 02c96e9f..32717a36 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -2,6 +2,7 @@ import os import time import asyncio +import pyautogui from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException @@ -112,7 +113,7 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): get_next_action(model, messages, objective, session_id) ) - stop = operate(operations, model) + stop = operate(operations, session_id, model) if stop: break @@ -131,60 +132,382 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): break -def operate(operations, model): - if config.verbose: - print("[Self Operating Computer][operate]") - for operation in operations: - if config.verbose: - print("[Self Operating Computer][operate] operation", operation) - # wait one second before processing each operation - time.sleep(1) - operate_type = operation.get("operation").lower() - operate_thought = operation.get("thought") - operate_detail = "" - if config.verbose: - print("[Self Operating Computer][operate] operate_type", operate_type) - - if operate_type == "press" or operate_type == "hotkey": - keys = operation.get("keys") - operate_detail = keys - operating_system.press(keys) - elif operate_type == "write": - content = operation.get("content") - operate_detail = content - operating_system.write(content) - elif operate_type == "click": - x = operation.get("x") - y = operation.get("y") - click_detail = {"x": x, "y": y} - operate_detail = click_detail - operating_system.mouse(click_detail) - elif operate_type == "done": - summary = operation.get("summary") - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" +# def verify_click_target(x_percent, y_percent, target_description, client): +# import pyautogui +# import base64 +# import io +# from PIL import Image, ImageDraw +# +# screen_width, screen_height = pyautogui.size() +# x = int(float(x_percent) * screen_width) +# y = int(float(y_percent) * screen_height) +# +# region_size = 100 +# region_left = max(0, x - region_size) +# region_top = max(0, y - region_size) +# region_width = min(region_size * 2, screen_width - region_left) +# region_height = min(region_size * 2, screen_height - region_top) +# +# region_screenshot = 
pyautogui.screenshot(region=(region_left, region_top, region_width, region_height)) +# +# draw = ImageDraw.Draw(region_screenshot) +# center_x = x - region_left +# center_y = y - region_top +# line_length = 20 +# draw.line((center_x - line_length, center_y, center_x + line_length, center_y), fill='red', width=2) +# draw.line((center_x, center_y - line_length, center_x, center_y + line_length), fill='red', width=2) +# +# buffer = io.BytesIO() +# region_screenshot.save(buffer, format="JPEG") +# img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') +# +# try: +# verification_prompt = f""" +# I'm about to click at the position marked with the red crosshair. +# I'm trying to click on: "{target_description}" +# +# Does the crosshair appear to be positioned correctly on or very near the target? +# Respond ONLY with "YES" if it's correct or "NO" if it's wrong. +# """ +# +# response = client.messages.create( +# model="claude-3-7-sonnet-20250219", +# messages=[{ +# "role": "user", +# "content": [ +# {"type": "text", "text": verification_prompt}, +# { +# "type": "image", +# "source": { +# "type": "base64", +# "media_type": "image/jpeg", +# "data": img_base64 +# } +# } +# ] +# }], +# max_tokens=50, +# ) +# +# verification_result = response.content[0].text.strip().upper() +# +# print(f"[Click Verification] Target: {target_description}") +# print(f"[Click Verification] Claude's response: {verification_result}") +# +# region_screenshot.save("debug_last_click_verification.jpg") +# +# return "YES" in verification_result +# +# except Exception as e: +# print(f"[Click Verification] Error during verification: {e}") +# return False + + +import cv2 +import numpy as np +import pyautogui +import os +import io +from PIL import Image, ImageDraw + + +def find_icon_on_screen(target_description): + """ + Uses computer vision to find an icon or UI element that matches the target description. + + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main folder") + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + # Take a screenshot of the entire screen + screenshot = pyautogui.screenshot() + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Save the screenshot for debugging + cv2.imwrite("debug_full_screen.jpg", screenshot_rgb) + + # Initialize results + results = [] + + # 1. Text detection for folder/file names (optional, requires pytesseract) + try: + import pytesseract + gray = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2GRAY) + + # Extract text from the screenshot + text_data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT) + + # Look for the target text in detected text + target_words = target_description.lower().split() + + for i, text in enumerate(text_data['text']): + if text and any(word in text.lower() for word in target_words): + # Get coordinates for this text + x = text_data['left'][i] + text_data['width'][i] // 2 + y = text_data['top'][i] + text_data['height'][i] // 2 + + # Add to results with high confidence + results.append((x, y, 0.9)) # 0.9 is confidence score + + # Draw a rectangle around the text for debugging + x1, y1 = text_data['left'][i], text_data['top'][i] + x2 = x1 + text_data['width'][i] + y2 = y1 + text_data['height'][i] + cv2.rectangle(screenshot_rgb, (x1, y1), (x2, y2), (0, 255, 0), 2) + except (ImportError, Exception) as e: + print(f"Text detection not available: {e}") + + # 2. 
Template matching for common desktop icons + icon_folder = "icon_templates" + if os.path.exists(icon_folder): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + template_h, template_w = template.shape[:2] + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + + # Get locations where the match exceeds threshold + threshold = 0.7 + loc = np.where(res >= threshold) + + for pt in zip(*loc[::-1]): + # Get center point of the match + x = pt[0] + template_w // 2 + y = pt[1] + template_h // 2 + confidence = res[pt[1], pt[0]] + + # Add to results + results.append((x, y, confidence)) + + # Draw for debugging + cv2.rectangle(screenshot_rgb, pt, (pt[0] + template_w, pt[1] + template_h), (0, 0, 255), 2) + + # 3. Folder icon detection using color and shape (backup method) + if not results: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + x = int(M["m10"] / M["m00"]) + y = int(M["m01"] / M["m00"]) + + # Add to results with lower confidence + results.append((x, y, 0.5)) + + # Draw for debugging + cv2.drawContours(screenshot_rgb, [contour], -1, (255, 0, 0), 2) + + # Save the annotated screenshot for debugging + cv2.imwrite("debug_target_detection.jpg", screenshot_rgb) + + if results: + # Sort by confidence + results.sort(key=lambda x: x[2], reverse=True) + best_match = results[0] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = best_match[0] / screen_width + y_percent = best_match[1] / screen_height + + return (x_percent, y_percent) + + return None + + +# def enhanced_click(target_description, model=None): +# """ +# Enhanced clicking function that uses computer vision to find and click on targets. 
+# +# Args: +# target_description (str): Description of what to click on +# model (str, optional): Model name for verification +# +# Returns: +# bool: True if click was successful, False otherwise +# """ +# # Try to find the target using computer vision +# coords = find_icon_on_screen(target_description) +# +# if coords: +# x_percent, y_percent = coords +# print(f"[Visual Target Finder] Found target '{target_description}' at ({x_percent:.3f}, {y_percent:.3f})") +# +# # Convert percentages to actual screen coordinates +# screen_width, screen_height = pyautogui.size() +# x_coord = int(x_percent * screen_width) +# y_coord = int(y_percent * screen_height) +# +# # Click on the found location +# pyautogui.click(x_coord, y_coord) +# return True +# else: +# print(f"[Visual Target Finder] Could not find target '{target_description}' on screen") +# return False + + +import pyautogui +import platform +import ctypes +import subprocess + + +def get_scaling_factor(): + """ + Detect the current DPI scaling factor based on the operating system. + Returns: + scaling_factor (float): A multiplier to adjust coordinates. + """ + os_name = platform.system() + scaling_factor = 1.0 + + if os_name == "Windows": + try: + user32 = ctypes.windll.user32 + user32.SetProcessDPIAware() + dc = user32.GetDC(0) + logical_width = user32.GetDeviceCaps(dc, 8) # HORZRES (logical width) + physical_width = user32.GetDeviceCaps(dc, 118) # DESKTOPHORZRES (physical width) + scaling_factor = physical_width / logical_width + user32.ReleaseDC(0, dc) + except Exception as e: + print("Windows scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Darwin": # macOS + try: + output = subprocess.check_output(["system_profiler", "SPDisplaysDataType"]) + output = output.decode("utf-8") + if "Retina" in output: + scaling_factor = 2.0 + else: + scaling_factor = 1.0 + except Exception as e: + print("macOS scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Linux": + try: + output = subprocess.check_output( + ["gsettings", "get", "org.gnome.desktop.interface", "scaling-factor"] ) - print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") - return True - elif operate_type == "wait": - duration = operation["duration"] - if isinstance(duration, str): - duration = float(duration) # Convert string to float/integer - print(f"{ANSI_GREEN}[Self-Operating Computer | Waiting for {duration} seconds...]{ANSI_RESET}") + scaling_factor = float(output.decode("utf-8").strip()) + except Exception as e: + print("Linux scaling detection error:", e) + scaling_factor = 1.0 + + return scaling_factor + + +def click_relative(x_percent, y_percent, x_divisor=1.50, y_multiplier=1.25): + """ + Converts relative coordinates to absolute screen coordinates, applies DPI scaling, + then divides the x-coordinate by x_divisor and multiplies the y-coordinate by y_multiplier before clicking. + + Args: + x_percent (float): Relative x-coordinate (e.g., 0.10 for 10% across). + y_percent (float): Relative y-coordinate (e.g., 0.20 for 20% down). + x_divisor (float): Value to divide the computed x-coordinate by (default 1.50). + y_multiplier (float): Value to multiply the computed y-coordinate by (default 1.25). + """ + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. 
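    # (Worked example, not part of the patch: on a 1920x1080 screen with
    # scaling_factor 1.0 and (x_percent, y_percent) = (0.10, 0.20),
    # base = (192.0, 216.0); dividing x by 1.50 and multiplying y by 1.25
    # gives a final click point of (128, 270).)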
+ adjusted_x = int(base_x / x_divisor) + adjusted_y = int(base_y * y_multiplier) + + print( + f"Clicking at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen with scaling factor {scaling_factor}") + pyautogui.click(adjusted_x, adjusted_y) + + +def operate(operations, session_id, model=None): + """ + Processes a list of operations and executes them. + Supports click, doubleclick, write, press, wait, and done operations. + For click operations, it uses the adjusted coordinate conversion: + - x-coordinate divided by 1.50. + - y-coordinate multiplied by 1.25. + """ + import time + + for op in operations: + if op.get("operation") in ["click", "doubleclick"]: + try: + x_percent = float(op.get("x", 0)) + y_percent = float(op.get("y", 0)) + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. + adjusted_x = int(base_x / 1.50) + adjusted_y = int(base_y * 1.25) + + print( + f"{'Double-clicking' if op.get('operation') == 'doubleclick' else 'Clicking'} " + f"at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " + f"with scaling factor {scaling_factor}" + ) + + if op.get("operation") == "doubleclick": + pyautogui.doubleClick(adjusted_x, adjusted_y) + else: + pyautogui.click(adjusted_x, adjusted_y) + except Exception as e: + print( + f"Error performing {'double-click' if op.get('operation') == 'doubleclick' else 'click'} operation:", + e) + + elif op.get("operation") == "write": + content = op.get("content", "") + pyautogui.write(content) + + elif op.get("operation") == "press": + keys = op.get("keys", []) + for key in keys: + pyautogui.press(key) + + elif op.get("operation") == "wait": + duration = float(op.get("duration", 1)) time.sleep(duration) - else: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" - ) - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" - ) - return True - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" - ) - print(f"{operate_thought}") - print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n") + elif op.get("operation") == "done": + print("Operation completed:", op.get("summary", "")) + return True # Stop processing further operations - return False + return False # Continue processing From 6c1c01516b674b63e33ca45ee54ab44d1396a1d8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:14:32 +0100 Subject: [PATCH 24/37] Update apis.py added reliable claude 3.7 usability --- operate/models/apis.py | 493 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 482 insertions(+), 11 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 22afd718..7db63ac1 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -142,13 +142,373 @@ def call_gpt_4o(messages): return call_gpt_4o(messages) +def extract_target_from_text(text): + """ + Extract target file/folder names from text with intelligent priority. 
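+    Heuristics are tried in priority order: quoted strings, file-like name patterns,
+    phrases following "click on"/"open", then capitalized words, falling back to the
+    original text if nothing better is found.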
+ + Args: + text (str): Text to analyze (thought or operation text) + + Returns: + str: The extracted target description + """ + import re + + # Priority 1: Look for quoted text which often indicates file/folder names + quoted_pattern = re.compile(r"['\"]([^'\"]+)['\"]") + quoted_matches = quoted_pattern.findall(text) + if quoted_matches: + return quoted_matches[0] + + # Priority 2: Look for file/folder patterns (word-word or words with extensions) + file_pattern = re.compile(r"(\w+[-\.]\w+[-\.]\w+|\w+[-\.]\w+)") + file_matches = file_pattern.findall(text) + for match in file_matches: + # Filter out things that don't look like folder/file names + if any(x in match.lower() for x in ['-main', 'folder', 'file', 'image', 'doc', '.', 'sbc']): + return match + + # Priority 3: Look for phrases after "click on X" or "open X" + click_phrases = ["click on ", "click the ", "clicking on ", "clicking the ", "open ", "opening "] + for phrase in click_phrases: + if phrase in text.lower(): + parts = text.lower().split(phrase, 1) + if len(parts) > 1: + # Extract up to a period, comma, or space + target = parts[1].split(".")[0].split(",")[0].strip() + # Only return if it's not too long (likely not a file name if very long) + if 2 <= len(target.split()) <= 5: + return target + + # Priority 4: Look for capitalized words which might be file/folder names + cap_word_pattern = re.compile(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b') + cap_matches = cap_word_pattern.findall(text) + if cap_matches: + # Filter to likely file/folder names + likely_matches = [m for m in cap_matches if len(m) > 3] + if likely_matches: + return likely_matches[0] + + # Default: just return the original text if nothing better found + return text + + +def find_ui_element_by_text_and_vision(target_description, screenshot_filename): + """ + Finds UI elements using multiple methods: text OCR, template matching, and shape detection. + Specialized for finding desktop icons, folders, and common UI elements. 
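+    Candidates from all approaches (saved icon templates, EasyOCR text matches, and
+    colour/shape contours) are scored for confidence, annotated debug images are saved,
+    and the highest-scoring match is returned as screen-percentage coordinates.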
+ + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main") + screenshot_filename (str): Path to screenshot file + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + import cv2 + import numpy as np + from PIL import Image + import easyocr + import os + import re + + # Clean up the target description for better matching + target_words = target_description.lower().split() + # Remove common words that don't help with identification + stop_words = ['the', 'a', 'an', 'to', 'on', 'in', 'by', 'it', 'this', 'that', 'for', 'with', 'click', 'double'] + target_words = [word for word in target_words if word not in stop_words] + clean_target = ' '.join(target_words) + + print(f"[Target Finder] Looking for: '{clean_target}'") + + # Load the screenshot + screenshot = Image.open(screenshot_filename) + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Create a debug image to visualize findings + debug_img = screenshot_rgb.copy() + + # Results will store all potential matches with their confidence scores + results = [] + + # APPROACH 1: Template matching with saved templates + icon_folder = "icon_templates" + if os.path.exists(icon_folder) and any(os.listdir(icon_folder)): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + # Extract the template name for matching + template_name = filename.replace('_', ' ').replace('.png', '').replace('.jpg', '') + + # Check if template name matches any part of the target + if any(word in template_name.lower() for word in target_words) or \ + any(word in clean_target for word in template_name.lower().split()): + + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) + + if max_val > 0.7: # Good match + template_h, template_w = template.shape[:2] + top_left = max_loc + bottom_right = (top_left[0] + template_w, top_left[1] + template_h) + center_x = top_left[0] + template_w // 2 + center_y = top_left[1] + template_h // 2 + + # Add to results with high confidence since it's a template match + match_score = max_val * 1.5 # Boost template matches + results.append({ + "type": "template", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (top_left[0], top_left[1], bottom_right[0], bottom_right[1]) + }) + + # Draw on debug image + cv2.rectangle(debug_img, top_left, bottom_right, (0, 255, 0), 2) + cv2.putText(debug_img, f"Template: {template_name} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + + # APPROACH 2: OCR text detection + try: + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + ocr_results = reader.readtext(screenshot_filename) + + for idx, (bbox, text, conf) in enumerate(ocr_results): + text_lower = text.lower() + + # Check for any word match + word_match = False + for word in target_words: + if len(word) > 2 and word in text_lower: # Avoid matching very short words + word_match = True + break + + # Calculate match score based on text similarity + if word_match or clean_target in text_lower or text_lower in clean_target: + # Calculate match score + from difflib import SequenceMatcher + similarity = 
SequenceMatcher(None, clean_target, text_lower).ratio() + match_score = similarity * conf + + # Especially boost exact matches or strong partial matches + if similarity > 0.8: + match_score *= 1.5 + + # Get center of text bounding box + bbox_points = np.array(bbox).astype(int) + center_x = np.mean([p[0] for p in bbox_points]) + center_y = np.mean([p[1] for p in bbox_points]) + + # Calculate bounding box rectangle + x_points = [p[0] for p in bbox_points] + y_points = [p[1] for p in bbox_points] + bbox_rect = (min(x_points), min(y_points), max(x_points), max(y_points)) + + # Add to results + results.append({ + "type": "text", + "text": text, + "confidence": match_score, + "center": (center_x, center_y), + "bbox": bbox_rect + }) + + # Draw on debug image + top_left = (int(bbox_rect[0]), int(bbox_rect[1])) + bottom_right = (int(bbox_rect[2]), int(bbox_rect[3])) + cv2.rectangle(debug_img, top_left, bottom_right, (0, 0, 255), 2) + cv2.putText(debug_img, f"OCR: {text} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2) + + # For text results, look for potential UI elements above (desktop icon case) + # If this looks like a desktop icon label, the actual icon is likely above it + if any(word in text_lower for word in ['folder', 'file', 'image', 'doc']) or \ + re.search(r'\w+[-\.]\w+', text_lower) or \ + "sbc" in text_lower: + # Define a region above the text to look for the icon + icon_area_width = bbox_rect[2] - bbox_rect[0] + icon_area_height = icon_area_width # Make it square + icon_area_top = max(0, bbox_rect[1] - icon_area_height - 10) # Above text with a small gap + icon_area_left = bbox_rect[0] + + icon_center_x = icon_area_left + icon_area_width // 2 + icon_center_y = icon_area_top + icon_area_height // 2 + + # Add this as a potential icon location with boosted confidence + icon_match_score = match_score * 1.2 # Boost confidence for icon targets + results.append({ + "type": "icon", + "confidence": icon_match_score, + "center": (icon_center_x, icon_center_y), + "bbox": (icon_area_left, icon_area_top, + icon_area_left + icon_area_width, icon_area_top + icon_area_height) + }) + + # Draw the potential icon area + cv2.rectangle(debug_img, + (int(icon_area_left), int(icon_area_top)), + (int(icon_area_left + icon_area_width), int(icon_area_top + icon_area_height)), + (255, 0, 0), 2) + cv2.putText(debug_img, f"Icon target ({icon_match_score:.2f})", + (int(icon_area_left), int(icon_area_top) - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + + except Exception as e: + print(f"[Target Finder] OCR detection error: {e}") + + # APPROACH 3: Folder icon detection (color/shape based) + if "folder" in clean_target or "file" in clean_target or "sbc" in clean_target: + try: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + center_x = int(M["m10"] / M["m00"]) + 
center_y = int(M["m01"] / M["m00"]) + + # Get bounding box + x, y, w, h = cv2.boundingRect(contour) + + # Add to results with lower confidence for shape-based detection + match_score = 0.5 # Base confidence for shape detection + results.append({ + "type": "shape", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (x, y, x + w, y + h) + }) + + # Draw on debug image + cv2.rectangle(debug_img, (x, y), (x + w, y + h), (255, 255, 0), 2) + cv2.putText(debug_img, f"Shape ({match_score:.2f})", + (x, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) + except Exception as e: + print(f"[Target Finder] Shape detection error: {e}") + + # Save the debug image + cv2.imwrite("debug_target_detection.jpg", debug_img) + + if results: + # Sort by confidence + results.sort(key=lambda x: x.get("confidence", 0), reverse=True) + best_match = results[0] + + # Print debug info + print(f"[Target Finder] Best match: {best_match['type']} with confidence {best_match['confidence']:.2f}") + + # Get the center point + center_x, center_y = best_match["center"] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = center_x / screen_width + y_percent = center_y / screen_height + + # Mark the final target on the debug image + result_img = cv2.circle(debug_img, (int(center_x), int(center_y)), 10, (0, 255, 255), -1) + cv2.imwrite("debug_final_target.jpg", result_img) + + return (x_percent, y_percent) + + print(f"[Target Finder] No match found for '{clean_target}'") + return None + + +def verify_success(screenshot_before, task_type="open_folder"): + """ + Verifies if an operation was successful by comparing before/after screenshots. + + Args: + screenshot_before: Screenshot taken before the operation + task_type: Type of task we're verifying (open_folder, click_button, etc.) 
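+            (only "open_folder" currently has a concrete check, based on the percentage
+            of pixels that changed between the before/after screenshots; other task
+            types fall through and return False)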
+ + Returns: + bool: True if operation appears successful, False otherwise + """ + import cv2 + import numpy as np + import pyautogui + + # Take a screenshot after the operation + screenshot_after = pyautogui.screenshot() + + # Convert to numpy arrays for comparison + before_np = np.array(screenshot_before) + after_np = np.array(screenshot_after) + + # Resize if dimensions don't match + if before_np.shape != after_np.shape: + after_np = cv2.resize(after_np, (before_np.shape[1], before_np.shape[0])) + + # For opening a folder, check for significant window change + if task_type == "open_folder": + # Calculate difference between images + diff = cv2.absdiff(before_np, after_np) + gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) + _, thresholded = cv2.threshold(gray_diff, 30, 255, cv2.THRESH_BINARY) + + # Calculate percentage of changed pixels + changed_pixels = np.count_nonzero(thresholded) + total_pixels = thresholded.size + change_percentage = (changed_pixels / total_pixels) * 100 + + # Save debug images + cv2.imwrite("debug_before.jpg", cv2.cvtColor(before_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_after.jpg", cv2.cvtColor(after_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_diff.jpg", thresholded) + + print(f"[Verification] Screen change: {change_percentage:.2f}%") + + # If significant portion of screen changed, likely a new window opened + return change_percentage > 15 + + return False + + def call_claude_37(messages): if config.verbose: print("[call_claude_37]") time.sleep(1) - # Import the anthropic module inside the function to ensure it's available + # Import all required modules import anthropic + import cv2 + import numpy as np + import re + import pyautogui + from PIL import Image try: screenshots_dir = "screenshots" @@ -243,17 +603,129 @@ def call_claude_37(messages): # Extract the content from the response content = response.content[0].text - content = clean_json(content) - - # Create assistant message - assistant_message = {"role": "assistant", "content": content} + # Check if Claude added text before the JSON + if content.strip().startswith("[") or content.strip().startswith("{"): + # Content is already in JSON format, just clean it + content = clean_json(content) + else: + # Claude might have added a message before the JSON + # Try to find JSON in the content + json_match = re.search(r'(\[.*\]|\{.*\})', content, re.DOTALL) + if json_match: + # Extract the JSON part + content = clean_json(json_match.group(1)) + else: + # If no JSON found, try to create a done operation + if "done" in content.lower() or "complete" in content.lower(): + content = '[{"thought": "Task complete", "operation": "done"}]' + else: + # Create a fallback operation + content = '[{"thought": "Continuing task", "operation": "wait", "duration": 1}]' + + # Log the cleaned content if config.verbose: - print("[call_claude_37] content", content) + print("[call_claude_37] cleaned content", content) - content = json.loads(content) + # Create assistant message with the original response + assistant_message = {"role": "assistant", "content": response.content[0].text} + + try: + # Try to parse as JSON + parsed_content = json.loads(content) + if config.verbose: + print("[call_claude_37] Successfully parsed content as JSON") + except json.JSONDecodeError as e: + # If JSON parsing fails, create a simple operation + print(f"[call_claude_37] JSON parsing failed: {e}. 
Creating fallback operation.") + parsed_content = [{"thought": "Continuing with task", "operation": "wait", "duration": 1}] + + # Process the operations with enhanced handling + processed_content = [] + + # Check if Claude is trying to do a double-click + need_double_click = False + for operation in parsed_content: + if operation.get("double_click", False): + need_double_click = True + break + if "thought" in operation: + if "double" in operation["thought"].lower() and "click" in operation["thought"].lower(): + need_double_click = True + break + + for i, operation in enumerate(parsed_content): + if operation.get("operation") == "click": + # Extract target description + target_description = "" + if "text" in operation: + target_description = operation.get("text") + elif "thought" in operation: + # Try to extract what we're clicking on from the thought + thought = operation.get("thought", "") + + # Look for quoted text first + quoted_match = re.search(r'[\'"]([^\'\"]+)[\'"]', thought) + if quoted_match: + target_description = quoted_match.group(1) + else: + # Look for instances of "sbc-images-main" or similar patterns + pattern_match = re.search(r'(\b\w+-\w+-\w+\b|\bsbc[- ]\w+\b)', thought, re.IGNORECASE) + if pattern_match: + target_description = pattern_match.group(1) + else: + # Fall back to looking for phrases after click indicators + click_indicators = ["click on", "click the", "clicking on", "clicking the"] + for indicator in click_indicators: + if indicator in thought.lower(): + parts = thought.lower().split(indicator, 1) + if len(parts) > 1: + target_description = parts[1].split(".")[0].split(",")[0].strip() + break + + if not target_description: + target_description = f"target at position ({operation['x']}, {operation['y']})" + + if config.verbose: + print(f"[call_claude_37] Target description: {target_description}") + + # Handle double-clicking if detected + if need_double_click and i == 0: # Only process the first click for double-click + # Extract coordinates + try: + x = operation["x"] + y = operation["y"] + + # Add a special marker to signal double-click + operation["double_click"] = True + + # Log the double-click intention + print( + f"[call_claude_37] Detected double-click operation on '{target_description}' at ({x}, {y})") + except Exception as e: + print(f"[call_claude_37] Error processing double-click: {e}") + + # For double-click operations, we only need to add the first click + # Skip adding second clicks to avoid duplicate operations + if need_double_click and i > 0: + if config.verbose: + print("[call_claude_37] Skipping duplicate click for double-click operation") + continue + + # Add the operation + if config.verbose: + print(f"[call_claude_37] Adding operation: {operation}") + + processed_content.append(operation) + else: + # For non-click operations, just append as is + processed_content.append(operation) + + # Add the assistant message to the history messages.append(assistant_message) - return content + + # Return the processed content + return processed_content if processed_content else [{"operation": "wait", "duration": 1}] except Exception as e: error_msg = str(e) @@ -275,9 +747,8 @@ def call_claude_37(messages): if config.verbose: traceback.print_exc() - # Fall back to GPT-4o - return call_gpt_4o(messages) - + # If an exception occurs, return a simple operation to keep things moving + return [{"thought": "Continuing task after error", "operation": "wait", "duration": 1}] async def call_qwen_vl_with_ocr(messages, objective, model): if config.verbose: 
print("[call_qwen_vl_with_ocr]") From 5d55d53eee358d7e1d51d4be39c655554a8abac8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:15:35 +0100 Subject: [PATCH 25/37] Update prompts.py added double click functionality --- operate/models/prompts.py | 88 +++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index f8ddad18..ad660686 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,28 +13,34 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` [{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard +2. doubleclick - Move mouse and double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load + +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -52,7 +58,14 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application +``` +[ + {{ "thought": "I want to open a file or application by double-clicking", "operation": "doubleclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 3: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -61,7 +74,7 @@ ] ``` -Example 3: Waits to the page to load before proceeding to interact +Example 4: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, @@ -83,27 +96,34 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 
+You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` [{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "label": "~x" }}] +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -120,7 +140,14 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application with a labeled element +``` +[ + {{ "thought": "I want to open a file or application by double-clicking on its labeled element", "operation": "doubleclick", "label": "~42" }} +] +``` + +Example 3: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -129,7 +156,7 @@ ] ``` -Example 3: Send a "Hello World" message in the chat +Example 4: Send a "Hello World" message in the chat ``` [ {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, @@ -137,7 +164,7 @@ ] ``` -Example 4: Waits to the page to load before proceeding to interact +Example 5: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -154,32 +181,39 @@ """ -# TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll SYSTEM_PROMPT_OCR = """ You are operating a {operating_system} computer, using the same operating system as a human. From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - Look for text to click. 
Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` [{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - Look for text to double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "text": "The text in the item to double click" }}] +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed + +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -206,7 +240,14 @@ ] ``` -Example 3: Search for someone on Linkedin when already on linkedin.com +Example 3: Double-clicking to open a file +``` +[ + {{ "thought": "I want to open a file by finding its text label and double-clicking", "operation": "doubleclick", "text": "my_document.txt" }} +] +``` + +Example 4: Search for someone on Linkedin when already on linkedin.com ``` [ {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, @@ -214,7 +255,8 @@ {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} ] ``` -Example 4: Waits to the page to load before proceeding to interact + +Example 5: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, From 40523ef4ecc701f1a63c5ca28b97452cc60ac1f8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:16:29 +0100 Subject: [PATCH 26/37] Update config.py From 28272bb69bcc103aa2b83925de3c2eb7babee1c5 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:19:17 +0100 Subject: [PATCH 27/37] A icon library for computer vision You can place the mouse over an icon and hit enter to make a screenshot of the icons area, the png will be saved on a folder for further use in guideing the SOC through icons. --- operate/setup_icon_templates.py | 117 ++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 operate/setup_icon_templates.py diff --git a/operate/setup_icon_templates.py b/operate/setup_icon_templates.py new file mode 100644 index 00000000..d9ad620b --- /dev/null +++ b/operate/setup_icon_templates.py @@ -0,0 +1,117 @@ +import os +import pyautogui +import time +import tkinter as tk +from tkinter import simpledialog, messagebox + + +def setup_icon_templates(): + """ + Simplified helper script to set up icon templates for visual target finding. + Uses simple coordinate input rather than visual selection. 
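+    Saved templates land in the icon_templates folder, where they are picked up by the
+    template-matching pass in operate/models/apis.py (find_ui_element_by_text_and_vision).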
+ """ + # Create templates directory if it doesn't exist + template_dir = "icon_templates" + if not os.path.exists(template_dir): + os.makedirs(template_dir) + print(f"Created directory: {template_dir}") + + # Create a simple GUI for capturing templates + root = tk.Tk() + root.title("Icon Template Capture Tool") + root.geometry("400x200") + + # Function to capture icon at cursor position + def capture_at_cursor(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Give user time to position cursor + messagebox.showinfo("Capture Icon", + "Position your mouse cursor over the center of the icon you want to capture, then click OK.") + + # Get cursor position + time.sleep(0.5) # Small delay after dialog closes + x, y = pyautogui.position() + + # Capture region around cursor (100x100 pixels) + region_size = 50 # pixels in each direction from center + region = (x - region_size, y - region_size, region_size * 2, region_size * 2) + + try: + # Capture the region + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Function to capture custom region + def capture_custom_region(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Ask for region coordinates + try: + x = simpledialog.askinteger("X Coordinate", "Enter X coordinate (left edge):", parent=root) + if x is None: return + + y = simpledialog.askinteger("Y Coordinate", "Enter Y coordinate (top edge):", parent=root) + if y is None: return + + width = simpledialog.askinteger("Width", "Enter width in pixels:", parent=root, minvalue=10, maxvalue=500) + if width is None: return + + height = simpledialog.askinteger("Height", "Enter height in pixels:", parent=root, minvalue=10, + maxvalue=500) + if height is None: return + + # Capture the specified region + region = (x, y, width, height) + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Create and place buttons + label = tk.Label(root, text="Icon Template Capture Tool", font=("Arial", 14)) + label.pack(pady=10) + + instructions = tk.Label(root, text="Choose a capture method:") + instructions.pack(pady=5) + + button_frame = tk.Frame(root) + button_frame.pack(pady=10) + + cursor_btn = tk.Button(button_frame, text="Capture at Cursor", command=capture_at_cursor, width=20) + cursor_btn.grid(row=0, column=0, padx=10, pady=5) + + region_btn = tk.Button(button_frame, text="Specify Region Manually", command=capture_custom_region, width=20) + region_btn.grid(row=0, column=1, padx=10, pady=5) + + close_btn = tk.Button(root, text="Close", command=root.destroy, width=10) + close_btn.pack(pady=10) + + # Start the GUI + root.mainloop() + + +if __name__ == "__main__": + setup_icon_templates() \ No 
newline at end of file From 6987cdcfddb0c617d4855b00c9320dc461354c6b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 21:17:02 +0100 Subject: [PATCH 28/37] Update README.md Small spelling correction --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb92f29f..9e53992b 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ operate -m o1-with-ocr ### Multimodal Models `-m` #### Try claude 3.7 `-m claude-3.7` -Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Antheopic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. +Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Anthropic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. ``` operate -m claude-3.7 From dad71cf9aca2eef38d69c6df5d5e77b6db0736f4 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:41:44 +0100 Subject: [PATCH 29/37] Add files via upload --- GUI_README.md | 121 +++++++++++ gui.py | 563 ++++++++++++++++++++++++++++++++++++++++++++++++++ gui_main.py | 105 ++++++++++ 3 files changed, 789 insertions(+) create mode 100644 GUI_README.md create mode 100644 gui.py create mode 100644 gui_main.py diff --git a/GUI_README.md b/GUI_README.md new file mode 100644 index 00000000..c819af77 --- /dev/null +++ b/GUI_README.md @@ -0,0 +1,121 @@ +# Self-Operating Computer GUI + +A graphical user interface for the Self-Operating Computer, allowing easy interaction with AI models to automate computer tasks. + +## Features + +- **Intuitive Chat Interface**: Communicate with the Self-Operating Computer through a familiar chat interface +- **Live Screenshot Preview**: See what the AI sees in real-time +- **Model Selection**: Choose from multiple AI models including GPT-4, Claude, Qwen, and more +- **Voice Control**: Speak your commands using the built-in voice recognition (requires whisper_mic) +- **Real-time Logs**: Monitor detailed logs of operations in real-time +- **Multi-platform**: Works on Windows, macOS, and Linux + +## Installation + +### Prerequisites + +- Python 3.8 or higher +- Self-Operating Computer installed and configured +- pip (Python package manager) + +### Required Packages + +```bash +pip install PyQt5 +pip install whisper_mic # Optional, for voice commands +``` + +## Usage + +### Running the GUI + +From the Self-Operating Computer directory: + +```bash +python gui_main.py +``` + +### Command Line Options + +``` +usage: gui_main.py [-h] [-m MODEL] [--verbose] [--light] + +Run the Self-Operating Computer GUI with a specified model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Specify the default model to use + --verbose Run with verbose logging + --light Use light mode instead of dark mode +``` + +### Examples + +```bash +# Run with GPT-4 model and verbose logging +python gui_main.py -m gpt-4-vision --verbose + +# Run with Claude 3 model in light mode +python gui_main.py -m claude-3 --light +``` + +## Interface Guide + +The GUI is divided into several sections: + +1. **Top Bar**: Contains model selection dropdown and verbose mode toggle +2. **Left Panel**: Displays the current screenshot that the AI sees +3. 
**Right Panel - Top**: Chat history showing your requests and system messages +4. **Right Panel - Bottom**: Detailed logs of operations in real-time +5. **Bottom Input**: Text field for typing tasks, Send button, and voice recording button + +## Model Support + +The GUI supports all models that the Self-Operating Computer supports: + +- GPT-4 Vision +- GPT-4 with SOM (Spatial Object Memory) +- GPT-4 with OCR +- Claude 3 +- Claude 3.7 +- Qwen-VL +- O1 with OCR +- Gemini Pro Vision +- LLaVA + +## API Keys + +The GUI uses the same API key configuration as the main Self-Operating Computer. If a required API key is missing, a prompt will appear asking you to enter it. + +## Troubleshooting + +### Voice Recognition Not Working + +Make sure you have installed whisper_mic: +```bash +pip install whisper_mic +``` + +### GUI Not Launching + +Check that PyQt5 is properly installed: +```bash +pip install PyQt5 +``` + +### Model Not Responding + +Ensure your API keys are properly configured in the Self-Operating Computer settings. + +## Integration with Existing Codebase + +The GUI integrates seamlessly with the existing Self-Operating Computer codebase: + +- It uses the same `operate.py` functions for executing tasks +- It leverages the same model APIs from `apis.py` +- It inherits configuration from `config.py` +- It preserves the same prompt formats from `prompts.py` + +The UI simply provides a graphical wrapper around these core components, making them more accessible to users who prefer not to use the comman \ No newline at end of file diff --git a/gui.py b/gui.py new file mode 100644 index 00000000..ac176bfc --- /dev/null +++ b/gui.py @@ -0,0 +1,563 @@ +import sys +import os +import time +import threading +import asyncio +import platform +import json +import base64 +from PyQt5.QtWidgets import QSizePolicy +from PyQt5.QtWidgets import ( + QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QTextEdit, QLineEdit, QPushButton, QComboBox, QCheckBox, + QLabel, QScrollArea, QFrame, QSplitter, QMessageBox, QProgressBar +) +from PyQt5.QtCore import Qt, QThread, pyqtSignal, pyqtSlot, QSize, QTimer +from PyQt5.QtGui import QFont, QIcon, QTextCursor, QColor, QPalette, QPixmap + +# Import directly from local modules +from operate.models.prompts import USER_QUESTION, get_system_prompt +from operate.config import Config +from operate.models.apis import get_next_action +from operate.utils.screenshot import capture_screen_with_cursor +from operate.exceptions import ModelNotRecognizedException +from operate.operate import operate, get_scaling_factor + +# Setup config +config = Config() + +# Define available models - match the models in apis.py +AVAILABLE_MODELS = [ + "gpt-4-vision", + "gpt-4-with-som", + "gpt-4-with-ocr", + "claude-3", + "claude-3.7", + "qwen-vl", + "o1-with-ocr", + "gemini-pro-vision", + "llava" +] + + +class LogRedirector: + """Redirects print output to the GUI log window""" + + def __init__(self, text_widget): + self.text_widget = text_widget + self.original_stdout = sys.stdout + self.original_stderr = sys.stderr + + def write(self, text): + self.original_stdout.write(text) + self.text_widget.append(text) + # Auto-scroll to bottom + self.text_widget.moveCursor(QTextCursor.End) + + def flush(self): + self.original_stdout.flush() + QApplication.processEvents() + + +class RecordButton(QPushButton): + """Custom button for voice recording that changes appearance when pressed""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setText("Hold to Record") + 
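+        # Checkable so the ":checked" rule in the stylesheet below can switch the
+        # button to its red recording state while it is held down.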
self.setCheckable(True) + self.setStyleSheet(""" + QPushButton { + background-color: #f0f0f0; + border: 2px solid #c0c0c0; + border-radius: 15px; + padding: 8px; + color: #404040; + } + QPushButton:checked { + background-color: #ff4444; + color: white; + border: 2px solid #dd2222; + } + """) + self.mic = None + + +class ScreenshotDisplay(QLabel): + """Widget to display the current screenshot""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setAlignment(Qt.AlignCenter) + self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding) + self.setMinimumHeight(200) + self.setStyleSheet("background-color: #121212; border: 1px solid #333;") + self.setText("No screenshot available") + + def update_screenshot(self, filename): + if os.path.exists(filename): + pixmap = QPixmap(filename) + # Scale pixmap to fit widget while maintaining aspect ratio + scaled_pixmap = pixmap.scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + else: + self.setText("Screenshot not found") + + def resizeEvent(self, event): + # If we have a pixmap, rescale it when the widget is resized + if hasattr(self, 'pixmap') and self.pixmap(): + scaled_pixmap = self.pixmap().scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + super().resizeEvent(event) + + +class OperateThread(QThread): + update_signal = pyqtSignal(str) + completed_signal = pyqtSignal() + error_signal = pyqtSignal(str) + screenshot_signal = pyqtSignal(str) + + def __init__(self, model, objective, voice_mode=False, verbose_mode=False): + super().__init__() + self.model = model + self.objective = objective + self.voice_mode = voice_mode + self.verbose_mode = verbose_mode + self.running = True + + def run(self): + try: + config.verbose = self.verbose_mode + config.validation(self.model, self.voice_mode) + + mic = None + if self.voice_mode: + try: + from whisper_mic import WhisperMic + mic = WhisperMic() + self.update_signal.emit("Voice recognition initialized.") + except ImportError: + self.error_signal.emit( + "Voice mode requires 'whisper_mic' module. Install with 'pip install -r requirements-audio.txt'") + return + + system_prompt = get_system_prompt(self.model, self.objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] + loop_count = 0 + session_id = None + + self.update_signal.emit(f"Starting task: {self.objective}") + + task_completed = False + while not task_completed and self.running: + if config.verbose: + self.update_signal.emit(f"[Self Operating Computer] loop_count {loop_count}") + + # Capture screenshot for UI + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + capture_screen_with_cursor(screenshot_filename) + self.screenshot_signal.emit(screenshot_filename) + + # Get next action from the model + operations, session_id = self.run_async( + get_next_action(self.model, messages, self.objective, session_id) + ) + + # Process the operations and update task_completed accordingly + task_completed = operate(operations, session_id, self.model) + + loop_count += 1 + if loop_count > 10: + task_completed = True + self.update_signal.emit("[Self-Operating Computer] Max loop count reached. 
Task considered complete.") + + # If the thread was stopped by the user, we can check the running flag: + if not self.running: + self.update_signal.emit("Task stopped by the user.") + else: + self.update_signal.emit("Task completed.") + self.completed_signal.emit() + + except Exception as e: + self.error_signal.emit(f"Thread error: {str(e)}") + + def stop(self): + self.running = False + + def run_async(self, coroutine): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coroutine) + finally: + loop.close() + + + +class VoiceRecordingThread(QThread): + finished_signal = pyqtSignal(str) + + def __init__(self, mic): + super().__init__() + self.mic = mic + + def run(self): + try: + # Call listen() without a stop_flag since it's not supported + result = self.mic.listen() + self.finished_signal.emit(result) + except Exception as e: + self.finished_signal.emit(f"Error: {str(e)}") + +class MessageWidget(QFrame): + """Widget to display a single message in the chat view""" + + def __init__(self, text, is_user=False, parent=None): + super().__init__(parent) + self.setFrameShape(QFrame.StyledPanel) + self.setStyleSheet( + "background-color: #c8c8c8; border-radius: 10px; margin: 5px;" if is_user else + "background-color: #d0d0d0; border-radius: 10px; margin: 5px;" + ) + + layout = QVBoxLayout(self) + + # Add a label for the sender + sender = QLabel("You:" if is_user else "System:") + sender.setStyleSheet("font-weight: bold; color: #333;") + layout.addWidget(sender) + + # Add the message text + message = QLabel(text) + message.setWordWrap(True) + message.setTextInteractionFlags(Qt.TextSelectableByMouse) + layout.addWidget(message) + + self.setLayout(layout) + + + +class SOCChatWindow(QMainWindow): + """Main chat window for the Self-Operating Computer interface""" + + def __init__(self): + super().__init__() + + self.setWindowTitle("Self-Operating Computer") + self.setMinimumSize(1000, 700) + + # Initialize mic to None + self.mic = None + self.operate_thread = None + + self.init_ui() + + # Try to initialize whisper_mic if available + try: + from whisper_mic import WhisperMic + self.mic = WhisperMic() + self.record_button.setEnabled(True) + except ImportError: + self.record_button.setEnabled(False) + self.record_button.setToolTip("Install whisper_mic module to use voice") + + def init_ui(self): + """Initialize the user interface""" + # Create the central widget and main layout + central_widget = QWidget() + main_layout = QVBoxLayout(central_widget) + + # Settings bar at the top + settings_layout = QHBoxLayout() + + # Model selection dropdown + model_label = QLabel("Model:") + self.model_combo = QComboBox() + self.model_combo.addItems(AVAILABLE_MODELS) + self.model_combo.setCurrentIndex( + AVAILABLE_MODELS.index("gpt-4-with-ocr") if "gpt-4-with-ocr" in AVAILABLE_MODELS else 0) + + # Verbose mode checkbox + self.verbose_checkbox = QCheckBox("Verbose Logs") + + # Add widgets to settings layout + settings_layout.addWidget(model_label) + settings_layout.addWidget(self.model_combo) + settings_layout.addWidget(self.verbose_checkbox) + settings_layout.addStretch(1) + + # Add settings to main layout + main_layout.addLayout(settings_layout) + + # Create a horizontal splitter for screenshot and chat views + h_splitter = QSplitter(Qt.Horizontal) + + # Left panel - Screenshot view + screenshot_container = QWidget() + screenshot_layout = QVBoxLayout(screenshot_container) + + # Screenshot label + screenshot_label = QLabel("Screen Preview:") + 
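+        # The preview below is refreshed from OperateThread.screenshot_signal after each
+        # loop's screen capture, so the panel tracks the screen as the agent operates.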
screenshot_layout.addWidget(screenshot_label) + + # Screenshot display + self.screenshot_display = ScreenshotDisplay() + screenshot_layout.addWidget(self.screenshot_display) + + h_splitter.addWidget(screenshot_container) + + # Right panel - Chat view and log + chat_log_splitter = QSplitter(Qt.Vertical) + + # Chat view area (top part of right panel) + chat_container = QWidget() + chat_layout = QVBoxLayout(chat_container) + + # Create the scrollable chat view + self.chat_scroll_area = QScrollArea() + self.chat_scroll_area.setWidgetResizable(True) + self.chat_content = QWidget() + self.chat_content_layout = QVBoxLayout(self.chat_content) + self.chat_content_layout.addStretch(1) # Push messages to the top + + self.chat_scroll_area.setWidget(self.chat_content) + chat_layout.addWidget(self.chat_scroll_area) + + # Input area + input_layout = QHBoxLayout() + + # Text input field + self.text_input = QLineEdit() + self.text_input.setPlaceholderText("Type your request here...") + self.text_input.returnPressed.connect(self.send_message) + + # Record button + self.record_button = RecordButton() + self.record_button.pressed.connect(self.start_recording) + self.record_button.released.connect(self.stop_recording) + + # Send button + self.send_button = QPushButton("Send") + self.send_button.clicked.connect(self.send_message) + + # **New Stop button** + self.stop_button = QPushButton("Stop") + self.stop_button.clicked.connect(self.stop_task) + + # Add widgets to input layout + input_layout.addWidget(self.text_input) + input_layout.addWidget(self.record_button) + input_layout.addWidget(self.send_button) + input_layout.addWidget(self.stop_button) # Add the Stop button + + # Add input area to chat layout + chat_layout.addLayout(input_layout) + + # Log view (bottom part of right panel) + self.log_view = QTextEdit() + self.log_view.setReadOnly(True) + self.log_view.setStyleSheet("font-family: Consolas, monospace; background-color: #222; color: #ddd;") + + # Add chat view and log view to the chat_log_splitter + chat_log_splitter.addWidget(chat_container) + chat_log_splitter.addWidget(self.log_view) + chat_log_splitter.setStretchFactor(0, 3) # Give chat view more space + chat_log_splitter.setStretchFactor(1, 2) + + # Add chat_log_splitter to the right side of h_splitter + h_splitter.addWidget(chat_log_splitter) + h_splitter.setStretchFactor(0, 1) # Screenshot area + h_splitter.setStretchFactor(1, 2) # Chat + log area + + # Add h_splitter to main layout + main_layout.addWidget(h_splitter) + + # Add progress indicator at the bottom (hidden by default) + self.progress_bar = QProgressBar() + self.progress_bar.setRange(0, 0) # Indeterminate mode + self.progress_bar.setVisible(False) + main_layout.addWidget(self.progress_bar) + + # Set the central widget + self.setCentralWidget(central_widget) + + # Redirect stdout to the log view + self.log_redirector = LogRedirector(self.log_view) + sys.stdout = self.log_redirector + sys.stderr = self.log_redirector + + # Add a welcome message to the chat + self.add_message("Welcome to Self-Operating Computer! 
What would you like done?", is_user=False) + + # Set focus to the text input + self.text_input.setFocus() + + # Check for screenshots directory and display the latest screenshot if available + screenshots_dir = "screenshots" + if os.path.exists(screenshots_dir): + screenshot_files = [f for f in os.listdir(screenshots_dir) if f.endswith('.png')] + if screenshot_files: + latest_screenshot = os.path.join(screenshots_dir, sorted(screenshot_files)[-1]) + self.screenshot_display.update_screenshot(latest_screenshot) + + def add_message(self, text, is_user=True): + """Add a message to the chat view""" + message_widget = MessageWidget(text, is_user) + self.chat_content_layout.insertWidget(self.chat_content_layout.count() - 1, message_widget) + + # Scroll to the bottom to show the new message + self.chat_scroll_area.verticalScrollBar().setValue( + self.chat_scroll_area.verticalScrollBar().maximum() + ) + + def send_message(self): + """Send a message and start processing the task""" + text = self.text_input.text().strip() + if not text: + return + + # Add the message to the chat view + self.add_message(text, is_user=True) + self.text_input.clear() + + # Start processing in a separate thread + self.process_task(text) + + def process_task(self, objective): + """Process a task in a separate thread""" + # Disable input while processing + self.text_input.setEnabled(False) + self.send_button.setEnabled(False) + self.record_button.setEnabled(False) + self.model_combo.setEnabled(False) + self.verbose_checkbox.setEnabled(False) + + # Show progress indicator + self.progress_bar.setVisible(True) + + # Get selected model and verbose setting + model = self.model_combo.currentText() + verbose = self.verbose_checkbox.isChecked() + + # Create and start the thread + self.operate_thread = OperateThread(model, objective, False, verbose) + self.operate_thread.update_signal.connect(self.update_log) + self.operate_thread.completed_signal.connect(self.task_completed) + self.operate_thread.error_signal.connect(self.handle_error) + self.operate_thread.screenshot_signal.connect(self.update_screenshot) + self.operate_thread.start() + + @pyqtSlot() + def stop_task(self): + if self.operate_thread is not None and self.operate_thread.isRunning(): + self.operate_thread.stop() # Signal the thread to stop + self.operate_thread.wait() # Wait for it to finish + self.add_message("Task stopped by the user.", is_user=False) + + # Re-enable input and hide progress indicator + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.record_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + self.progress_bar.setVisible(False) + self.text_input.setFocus() + + @pyqtSlot(str) + def update_log(self, text): + """Update the log view with new text""" + print(text) + + @pyqtSlot(str) + def update_screenshot(self, filename): + """Update the screenshot display with the latest screenshot""" + self.screenshot_display.update_screenshot(filename) + + @pyqtSlot() + def task_completed(self): + """Handle task completion""" + # Add completion message to chat + self.add_message("Task completed! 
What would you like to do next?", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + @pyqtSlot(str) + def handle_error(self, error_text): + """Handle errors from the operate thread""" + print(f"ERROR: {error_text}") + self.add_message(f"An error occurred: {error_text}", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + def start_recording(self): + """Start voice recording""" + if not self.mic: + return + + self.record_thread = VoiceRecordingThread(self.mic) + self.record_thread.finished_signal.connect(self.process_voice_result) + self.record_thread.start() + + def stop_recording(self): + """Stop voice recording gracefully.""" + if hasattr(self, 'record_thread') and self.record_thread.isRunning(): + self.record_thread.stop() # signal the thread to stop + self.record_thread.wait(2000) # wait up to 2 seconds for the thread to finish + + @pyqtSlot(str) + def process_voice_result(self, result): + """Process the result from voice recognition""" + if result.startswith("Error:"): + QMessageBox.warning(self, "Voice Recognition Error", result) + return + + # Set the recognized text to the input field and send it + self.text_input.setText(result) + self.send_message() + + def closeEvent(self, event): + """Handle window close event""" + # Stop any running thread + if self.operate_thread and self.operate_thread.isRunning(): + self.operate_thread.stop() + self.operate_thread.wait() + + # Restore stdout and stderr + sys.stdout = self.log_redirector.original_stdout + sys.stderr = self.log_redirector.original_stderr + + event.accept() \ No newline at end of file diff --git a/gui_main.py b/gui_main.py new file mode 100644 index 00000000..f0154db6 --- /dev/null +++ b/gui_main.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +""" +Self-Operating Computer GUI +""" +import sys +import os +import argparse +from PyQt5.QtWidgets import QApplication + +# Add the root directory to the system path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Import after setting path +from operate.config import Config +from operate.utils.style import ANSI_BRIGHT_MAGENTA +from gui import SOCChatWindow + + +def main_entry(): + """ + Main entry point for the Self-Operating Computer GUI + """ + parser = argparse.ArgumentParser( + description="Run the Self-Operating Computer GUI with a specified model." 
+ ) + parser.add_argument( + "-m", + "--model", + help="Specify the default model to use", + required=False, + default="gpt-4-with-ocr", + ) + + # Add a flag for verbose mode + parser.add_argument( + "--verbose", + help="Run with verbose logging", + action="store_true", + ) + + # Allow for dark or light mode + parser.add_argument( + "--light", + help="Use light mode instead of dark mode", + action="store_true", + ) + + try: + args = parser.parse_args() + + # Create Qt application + app = QApplication(sys.argv) + app.setStyle("Fusion") + + # Apply dark mode palette unless light mode is requested + if not args.light: + from PyQt5.QtGui import QPalette, QColor + from PyQt5.QtCore import Qt + + palette = QPalette() + palette.setColor(QPalette.Window, QColor(53, 53, 53)) + palette.setColor(QPalette.WindowText, Qt.white) + palette.setColor(QPalette.Base, QColor(25, 25, 25)) + palette.setColor(QPalette.AlternateBase, QColor(53, 53, 53)) + palette.setColor(QPalette.ToolTipBase, Qt.white) + palette.setColor(QPalette.ToolTipText, Qt.white) + palette.setColor(QPalette.Text, Qt.white) + palette.setColor(QPalette.Button, QColor(53, 53, 53)) + palette.setColor(QPalette.ButtonText, Qt.white) + palette.setColor(QPalette.BrightText, Qt.red) + palette.setColor(QPalette.Link, QColor(42, 130, 218)) + palette.setColor(QPalette.Highlight, QColor(42, 130, 218)) + palette.setColor(QPalette.HighlightedText, Qt.black) + app.setPalette(palette) + + # Initialize configuration + config = Config() + config.verbose = args.verbose + + # Create and show the main window + window = SOCChatWindow() + + # Set the default model based on command-line argument + model_index = window.model_combo.findText(args.model) + if model_index >= 0: + window.model_combo.setCurrentIndex(model_index) + + # Set verbose checkbox based on command-line argument + window.verbose_checkbox.setChecked(args.verbose) + + # Show the window + window.show() + + # Run the application + sys.exit(app.exec_()) + + except KeyboardInterrupt: + print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") + except Exception as e: + print(f"Error starting GUI: {str(e)}") + sys.exit(1) + + +if __name__ == "__main__": + main_entry() \ No newline at end of file From 296ee78987b489e0759ab17b2686a62af3ea2b5b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:42:19 +0100 Subject: [PATCH 30/37] Update operate.py --- operate/operate.py | 270 ++++++++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 114 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 32717a36..18bf4d03 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -5,31 +5,9 @@ import pyautogui from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt -from operate.exceptions import ModelNotRecognizedException import platform # from operate.models.prompts import USER_QUESTION, get_system_prompt -from operate.models.prompts import ( - USER_QUESTION, - get_system_prompt, -) -from operate.config import Config -from operate.utils.style import ( - ANSI_GREEN, - ANSI_RESET, - ANSI_YELLOW, - ANSI_RED, - ANSI_BRIGHT_MAGENTA, - ANSI_BLUE, - style, -) -from operate.utils.operating_system import OperatingSystem -from operate.models.apis import get_next_action - -# Load configuration -config = Config() -operating_system = OperatingSystem() - def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): """ @@ -43,93 +21,127 @@ def main(model, terminal_prompt, voice_mode=False, 
verbose_mode=False): Returns: None """ + from operate.config import Config + from operate.exceptions import ModelNotRecognizedException + + from operate.utils.style import ( + ANSI_GREEN, + ANSI_RESET, + ANSI_YELLOW, + ANSI_RED, + ANSI_BRIGHT_MAGENTA, + ANSI_BLUE, + style, + ) + + from operate.utils.operating_system import OperatingSystem + from operate.models.prompts import ( + USER_QUESTION, + get_system_prompt, + ) + + # Load configuration + config = Config() + operating_system = OperatingSystem() + + from operate.models.apis import get_next_action + + while True: # Add outer loop to enable restarting after completion + mic = None + # Initialize `WhisperMic`, if `voice_mode` is True + + config.verbose = verbose_mode + config.validation(model, voice_mode) + + if voice_mode: + try: + from whisper_mic import WhisperMic - mic = None - # Initialize `WhisperMic`, if `voice_mode` is True - - config.verbose = verbose_mode - config.validation(model, voice_mode) - - if voice_mode: - try: - from whisper_mic import WhisperMic - - # Initialize WhisperMic if import is successful - mic = WhisperMic() - except ImportError: + # Initialize WhisperMic if import is successful + mic = WhisperMic() + except ImportError: + print( + "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + ) + sys.exit(1) + + # Skip message dialog if prompt was given directly + if not terminal_prompt: + message_dialog( + title="Self-Operating Computer", + text="An experimental framework to enable multimodal models to operate computers", + style=style, + ).run() + + else: + print("Running direct prompt...") + + # # Clear the console + if platform.system() == "Windows": + os.system("cls") + else: + print("\033c", end="") + + if terminal_prompt and not hasattr(main, 'first_run_complete'): + # Only use the terminal prompt on the first iteration + objective = terminal_prompt + main.first_run_complete = True + elif voice_mode: print( - "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" ) - sys.exit(1) - - # Skip message dialog if prompt was given directly - if not terminal_prompt: - message_dialog( - title="Self-Operating Computer", - text="An experimental framework to enable multimodal models to operate computers", - style=style, - ).run() - - else: - print("Running direct prompt...") - - # # Clear the console - if platform.system() == "Windows": - os.system("cls") - else: - print("\033c", end="") - - if terminal_prompt: # Skip objective prompt if it was given as an argument - objective = terminal_prompt - elif voice_mode: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)" - ) - try: - objective = mic.listen() - except Exception as e: - print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") - return # Exit if voice input fails - else: - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" - ) - print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") - objective = prompt(style=style) - - system_prompt = get_system_prompt(model, objective) - system_message = {"role": "system", "content": system_prompt} - messages = [system_message] - - loop_count = 0 - - session_id = None - - while True: - if config.verbose: - print("[Self Operating Computer] loop_count", loop_count) - try: - operations, session_id = asyncio.run( - get_next_action(model, messages, objective, session_id) + try: + objective = mic.listen() + except Exception as e: + print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") + return # Exit if voice input fails + else: + print( + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" ) + print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") + objective = prompt(style=style) - stop = operate(operations, session_id, model) - if stop: - break + system_prompt = get_system_prompt(model, objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] - loop_count += 1 - if loop_count > 10: - break - except ModelNotRecognizedException as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break - except Exception as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break + loop_count = 0 + + session_id = None + + task_completed = False # Flag to indicate if the task was completed + while not task_completed: + if config.verbose: + print("[Self Operating Computer] loop_count", loop_count) + try: + operations, session_id = asyncio.run( + get_next_action(model, messages, objective, session_id) + ) + + # Instead of breaking out of the whole program, we set a flag if "done" is reached + task_completed = operate(operations, session_id, model) + + loop_count += 1 + if loop_count > 10: + task_completed = True # Force completion if loop count exceeds 10 + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Max loop count reached. Moving to next task.{ANSI_RESET}") + except ModelNotRecognizedException as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + + print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Task completed. Ready for a new task.") + if terminal_prompt: + # If the session was started with a terminal prompt, we need to clear it after the first use + terminal_prompt = None # def verify_click_target(x_percent, y_percent, target_description, client): @@ -455,15 +467,18 @@ def click_relative(x_percent, y_percent, x_divisor=1.50, y_multiplier=1.25): def operate(operations, session_id, model=None): """ Processes a list of operations and executes them. - Supports click, doubleclick, write, press, wait, and done operations. - For click operations, it uses the adjusted coordinate conversion: + Supports click, doubleclick, rightclick, scroll, write, press, wait, and done operations. 
+ For click/doubleclick/rightclick operations, it uses the adjusted coordinate conversion: - x-coordinate divided by 1.50. - y-coordinate multiplied by 1.25. + + Returns: + bool: True if "done" operation was encountered (task completed), otherwise False """ import time for op in operations: - if op.get("operation") in ["click", "doubleclick"]: + if op.get("operation") in ["click", "doubleclick", "rightclick"]: try: x_percent = float(op.get("x", 0)) y_percent = float(op.get("y", 0)) @@ -478,20 +493,47 @@ def operate(operations, session_id, model=None): adjusted_x = int(base_x / 1.50) adjusted_y = int(base_y * 1.25) + operation_type = op.get("operation") + operation_name = { + "click": "Clicking", + "doubleclick": "Double-clicking", + "rightclick": "Right-clicking" + }.get(operation_type, operation_type) + print( - f"{'Double-clicking' if op.get('operation') == 'doubleclick' else 'Clicking'} " - f"at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " + f"{operation_name} at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " f"with scaling factor {scaling_factor}" ) - if op.get("operation") == "doubleclick": + if operation_type == "doubleclick": pyautogui.doubleClick(adjusted_x, adjusted_y) + elif operation_type == "rightclick": + pyautogui.rightClick(adjusted_x, adjusted_y) else: pyautogui.click(adjusted_x, adjusted_y) except Exception as e: - print( - f"Error performing {'double-click' if op.get('operation') == 'doubleclick' else 'click'} operation:", - e) + print(f"Error performing {op.get('operation')} operation:", e) + + elif op.get("operation") == "scroll": + try: + direction = op.get("direction", "down") + amount = int(op.get("amount", 3)) + + # Convert direction to clicks (positive for down/right, negative for up/left) + clicks = amount + if direction in ["up", "left"]: + clicks = -amount + + if direction in ["up", "down"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.scroll(clicks) + elif direction in ["left", "right"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.hscroll(clicks) + else: + print(f"Invalid scroll direction: {direction}") + except Exception as e: + print(f"Error performing scroll operation:", e) elif op.get("operation") == "write": content = op.get("content", "") @@ -508,6 +550,6 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "done": print("Operation completed:", op.get("summary", "")) - return True # Stop processing further operations + return True # Signal that the task is completed - return False # Continue processing + return False # Continue processing this task From 87240336f182e357fe1879b1330908560778a445 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:43:02 +0100 Subject: [PATCH 31/37] Update prompts.py --- operate/models/prompts.py | 121 ++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 25 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index ad660686..197ab143 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,7 +13,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. 
The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` @@ -25,22 +25,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ``` -3. write - Write with your keyboard +3. rightclick - Move mouse and right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -65,7 +75,21 @@ ] ``` -Example 3: Focuses on the address bar in a browser before typing a website +Example 3: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options", "operation": "rightclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -74,7 +98,7 @@ ] ``` -Example 4: Waits to the page to load before proceeding to interact +Example 6: Waits for the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, @@ -96,7 +120,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` @@ -108,22 +132,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "label": "~x" }}] ``` -3. write - Write with your keyboard +3. 
rightclick - Move mouse and right click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "label": "~x" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -147,7 +181,21 @@ ] ``` -Example 3: Focuses on the address bar in a browser before typing a website +Example 3: Right-clicking to open a context menu with a labeled element +``` +[ + {{ "thought": "I want to open the context menu for this element to see available options", "operation": "rightclick", "label": "~42" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -156,7 +204,7 @@ ] ``` -Example 4: Send a "Hello World" message in the chat +Example 6: Send a "Hello World" message in the chat ``` [ {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, @@ -164,7 +212,7 @@ ] ``` -Example 5: Waits to the page to load before proceeding to interact +Example 7: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -186,7 +234,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` @@ -198,22 +246,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "text": "The text in the item to double click" }}] ``` -3. write - Write with your keyboard +3. 
rightclick - Move mouse and right click - Look for text to right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "text": "The text in the item to right click" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -247,7 +305,21 @@ ] ``` -Example 4: Search for someone on Linkedin when already on linkedin.com +Example 4: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options for this item", "operation": "rightclick", "text": "my_document.txt" }} +] +``` + +Example 5: Scrolling through content +``` +[ + {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 6: Search for someone on Linkedin when already on linkedin.com ``` [ {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, @@ -256,7 +328,7 @@ ] ``` -Example 5: Waits to the page to load before proceeding to interact +Example 7: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -276,17 +348,16 @@ """ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. Action:""" OPERATE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 
Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait Action:""" - def get_system_prompt(model, objective): """ Format the vision prompt more efficiently and print the name of the prompt used From bc980c2398b723266f57e800c17e964ae547010e Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:45:53 +0100 Subject: [PATCH 32/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e53992b..108bafa6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ome
-  A framework to enable multimodal models to operate a computer.
+  A framework to enable multimodal models to operate a computer, now with a GUI and support for double-click, right-click, scroll, and wait operations.
Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. Released Nov 2023, the Self-Operating Computer Framework was one of the first examples of using a multimodal model to view the screen and operate a computer. From 963b866f8fd14215c8461f5ce18a4612cfc4118d Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:07:04 +0100 Subject: [PATCH 33/37] Update operate.py more scroll integer number so it scrolls faster --- operate/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/operate.py b/operate/operate.py index 18bf4d03..bbd7fb5b 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -517,7 +517,7 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: direction = op.get("direction", "down") - amount = int(op.get("amount", 3)) + amount = int(op.get("amount", 25)) # Convert direction to clicks (positive for down/right, negative for up/left) clicks = amount From e1d92bbbfbde87c8d318f89510b21658487310bb Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:24:15 +0100 Subject: [PATCH 34/37] Update operate.py now the scroll scrolls faster --- operate/operate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index bbd7fb5b..6dddb2ba 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -517,10 +517,10 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: direction = op.get("direction", "down") - amount = int(op.get("amount", 25)) + amount = int(op.get("amount", 3)) # Convert direction to clicks (positive for down/right, negative for up/left) - clicks = amount + clicks = amount * 15 if direction in ["up", "left"]: clicks = -amount From fa9f9f5b5b7326783648eabd52ccd4fcfe639254 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:49:50 +0100 Subject: [PATCH 35/37] Update operate.py had to redo some lines for the scroll to work well --- operate/operate.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 6dddb2ba..f279ab93 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -516,20 +516,26 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: - direction = op.get("direction", "down") + + direction = op.get("direction") amount = int(op.get("amount", 3)) # Convert direction to clicks (positive for down/right, negative for up/left) - clicks = amount * 15 + + if direction in ["down", "right"]: + clicks = amount * 150 + if direction in ["up", "left"]: - clicks = -amount + clicks = -amount * 150 if direction in ["up", "down"]: print(f"Scrolling {direction} by {amount} clicks") pyautogui.scroll(clicks) + elif direction in ["left", "right"]: print(f"Scrolling {direction} by {amount} clicks") pyautogui.hscroll(clicks) + else: print(f"Invalid scroll direction: {direction}") except Exception as e: From 0ce1ff3844c4d72d02c45e0fc692a324bd68256c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:05:47 +0100 Subject: [PATCH 36/37] Update operate.py sorry, better now --- operate/operate.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 
deletions(-) diff --git a/operate/operate.py b/operate/operate.py index f279ab93..fb84c3e1 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -514,32 +514,58 @@ def operate(operations, session_id, model=None): except Exception as e: print(f"Error performing {op.get('operation')} operation:", e) + elif op.get("operation") == "scroll": + try: - direction = op.get("direction") - amount = int(op.get("amount", 3)) + direction = op.get("direction", "") + + amount = int(op.get("amount", 0)) + + # For vertical scrolling: positive for up, negative for down - # Convert direction to clicks (positive for down/right, negative for up/left) + if direction == "up": - if direction in ["down", "right"]: clicks = amount * 150 - if direction in ["up", "left"]: + elif direction == "down": + clicks = -amount * 150 + # For horizontal scrolling: negative for left, positive for right + + elif direction == "left": + + clicks = -amount * 150 + + elif direction == "right": + + clicks = amount * 150 + + else: + + print(f"Invalid scroll direction: {direction}") + + clicks = 0 + + # Execute scroll based on direction type + if direction in ["up", "down"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.scroll(clicks) elif direction in ["left", "right"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.hscroll(clicks) - else: - print(f"Invalid scroll direction: {direction}") except Exception as e: - print(f"Error performing scroll operation:", e) + + print("Error performing scroll operation:", e) elif op.get("operation") == "write": content = op.get("content", "") From 3a4d0d9ecdfe3c50b9008398ba87ef391fabe794 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:10:05 +0100 Subject: [PATCH 37/37] Update prompts.py changes in scroll prompt to suite better our needs --- operate/models/prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 197ab143..c3b4e2ad 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -85,7 +85,7 @@ Example 4: Scrolling down a webpage ``` [ - {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ``` @@ -191,7 +191,7 @@ Example 4: Scrolling down a webpage ``` [ - {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ``` @@ -315,7 +315,7 @@ Example 5: Scrolling through content ``` [ - {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ```
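
For reference, the scroll handling that PATCH 36/37 converges on can be exercised outside the agent loop with a small standalone helper. The sketch below mirrors the direction-to-clicks mapping in the `operate()` diff (the `* 150` multiplier and the `pyautogui.scroll` / `pyautogui.hscroll` calls are taken from the patch); the `perform_scroll` name and the `dry_run` flag are illustrative additions rather than part of the series, and horizontal scrolling support in `pyautogui` differs across platforms.

```
import pyautogui

# Multiplier used in PATCH 36 so model-suggested amounts scroll a useful distance.
SCROLL_MULTIPLIER = 150


def perform_scroll(direction, amount=3, dry_run=False):
    """Translate a model-issued scroll operation into a pyautogui call.

    Mirrors the mapping in operate(): up/right map to positive clicks,
    down/left to negative clicks, and vertical vs. horizontal directions
    choose between pyautogui.scroll() and pyautogui.hscroll().
    """
    try:
        amount = int(amount)
    except (TypeError, ValueError):
        print(f"Invalid scroll amount: {amount!r}")
        return

    sign = {"up": 1, "right": 1, "down": -1, "left": -1}.get(direction)
    if sign is None:
        print(f"Invalid scroll direction: {direction!r}")
        return

    clicks = sign * amount * SCROLL_MULTIPLIER
    print(f"Scrolling {direction} by {amount} clicks ({clicks} raw units)")
    if dry_run:
        return

    if direction in ("up", "down"):
        pyautogui.scroll(clicks)
    else:
        # hscroll is not implemented on every platform pyautogui supports.
        pyautogui.hscroll(clicks)


if __name__ == "__main__":
    # Example payload shape from the scroll prompt: a direction plus an amount string.
    perform_scroll("down", "5", dry_run=True)
```

Running it with `dry_run=True` only prints the computed click count, which is a convenient way to sanity-check a different multiplier before letting it drive the real scroll wheel.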