From 89c5b368c840dc5fb4160139bbfb5bee26b6b348 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 8 Feb 2025 21:46:40 +0100 Subject: [PATCH 01/37] Update operate.py Added wait function --- operate/operate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index c63d9851..e70aa605 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -137,7 +137,7 @@ def operate(operations, model): for operation in operations: if config.verbose: print("[Self Operating Computer][operate] operation", operation) - # wait one second + # wait one second before processing each operation time.sleep(1) operate_type = operation.get("operation").lower() operate_thought = operation.get("thought") @@ -158,17 +158,19 @@ def operate(operations, model): y = operation.get("y") click_detail = {"x": x, "y": y} operate_detail = click_detail - operating_system.mouse(click_detail) elif operate_type == "done": summary = operation.get("summary") - print( f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" ) print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True - + elif operate_type == "wait" or operate_type == "none": + print( + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for 5 seconds...{ANSI_RESET}]" + ) + time.sleep(5) else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From 1760d989ac220109de8c26546ee46c35d23eeb33 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 9 Feb 2025 12:21:37 +0100 Subject: [PATCH 02/37] Update operate.py I uploaded a functional code to wait if the screen isnt yet loaded and it works perfectly. --- operate/operate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index e70aa605..b052301a 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -167,10 +167,12 @@ def operate(operations, model): print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True elif operate_type == "wait" or operate_type == "none": + duration = operation.get("duration", 5) # Default to 5 seconds if not specified print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for 5 seconds...{ANSI_RESET}]" + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for {duration} seconds...{ANSI_RESET}]" ) - time.sleep(5) + time.sleep(duration) + operate_detail = f"waiting {duration}s" else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From bd294ad2df80c30e8e996a34e444fac34c6318de Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Fri, 28 Feb 2025 23:39:33 +0100 Subject: [PATCH 03/37] Update prompts.py Added wait operation to the prompt so if the page isn't loaded yet it uses operate.py "wait" operation and Waits for 5 seconds. --- operate/models/prompts.py | 44 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 0a7e0ad1..14efbad0 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,7 +13,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. 
The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` @@ -34,6 +34,10 @@ ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -57,6 +61,14 @@ ] ``` +Example 3: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -71,7 +83,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` @@ -90,6 +102,12 @@ ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` + +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -119,6 +137,14 @@ ] ``` +Example 4: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` + A few important notes: - Go to Google Docs and Google Sheets by typing in the Chrome Address bar @@ -134,7 +160,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 4 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. 
click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` @@ -153,6 +179,11 @@ [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` +5. wait - Wait some time for a page to load +``` +[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +``` + Return the actions in array format `[]`. You can take just one action or multiple actions. Here a helpful example: @@ -183,6 +214,13 @@ {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} ] ``` +Example 4: Waits to the page to load before proceeding to interact +``` +[ + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +] +``` A few important notes: From 803cdc83dafd5af732f9400246b85246996e6d11 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Fri, 28 Feb 2025 23:54:42 +0100 Subject: [PATCH 04/37] Update prompts.py Corrected and finished adding wait operation to prompt. --- operate/models/prompts.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 14efbad0..8c47788e 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -36,7 +36,7 @@ ``` 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -64,7 +64,7 @@ Example 3: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "waiting 5 seconds"}}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -105,7 +105,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. 
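Because the prompt examples return `duration` sometimes as a bare number and sometimes as a string, the executor side benefits from coercing the value defensively before sleeping. The helper below is only a minimal sketch of that idea, not code from these patches; the function name `handle_wait_operation` and the 30-second cap are assumptions added for illustration.

```
import time


def handle_wait_operation(operation, default_seconds=5.0, max_seconds=30.0):
    """Coerce a model-supplied wait duration into a safe float and sleep.

    The "duration" key and the 5-second default mirror the prompt examples
    above; the upper bound is an extra assumption added for safety.
    """
    raw = operation.get("duration", default_seconds)
    try:
        seconds = float(raw)  # the model may return 5, "5", or "5.0"
    except (TypeError, ValueError):
        seconds = default_seconds  # fall back rather than crash on odd JSON
    seconds = max(0.0, min(seconds, max_seconds))
    time.sleep(seconds)
    return seconds
```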
@@ -140,7 +140,7 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -181,7 +181,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": ["seconds to wait (e.g. 5 seconds)"] }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -217,7 +217,7 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "seconds": [5] }}, + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ] ``` @@ -234,14 +234,14 @@ """ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. Action:""" OPERATE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 4 operations available: click, write, press, done +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait Action:""" From 8ac908d35725a367db39a5abb951f79a7299629e Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 00:24:13 +0100 Subject: [PATCH 05/37] Update prompts.py Made the prompt more coherent --- operate/models/prompts.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 8c47788e..eab7ebdc 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -36,7 +36,7 @@ ``` 5. 
wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -64,8 +64,8 @@ Example 3: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "waiting 5 seconds"}}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` @@ -105,7 +105,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -140,8 +140,8 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` @@ -181,7 +181,7 @@ 5. wait - Wait some time for a page to load ``` -[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5 seconds)" }}] +[{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` Return the actions in array format `[]`. You can take just one action or multiple actions. @@ -217,8 +217,8 @@ Example 4: Waits to the page to load before proceeding to interact ``` [ - {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "Waiting 5 seconds" }}, - {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 
0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format + {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, + {{ "thought": "Now that the page is loaded and the button to click is in focus I will click the button", "operation": "click", "x": "0.10", "y": "0.13" }}] ] ``` From 983288501da32387c7b8d23b6f1458c31847007d Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:29:51 +0100 Subject: [PATCH 06/37] Update README.md new model (Claude 3.7) explanation --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ab24691c..cb92f29f 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ ome ## Key Features - **Compatibility**: Designed for various multimodal models. -- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3 and LLaVa.** +- **Integration**: Currently integrated with **GPT-4o, o1, Claude 3.7, Gemini Pro Vision, Claude 3, qwuen-VL and LLaVa.** - **Future Plans**: Support for additional models. ## Demo @@ -62,6 +62,14 @@ operate -m o1-with-ocr ### Multimodal Models `-m` + +#### Try claude 3.7 `-m claude-3.7` +Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Antheopic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. + +``` +operate -m claude-3.7 +``` + Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model ``` operate -m gemini-pro-vision @@ -76,6 +84,13 @@ Use Claude 3 with Vision to see how it stacks up to GPT-4-Vision at operating a operate -m claude-3 ``` +#### Try qwen `-m qwen-vl` +Use Qwen-vl with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Qwen dashboard](https://bailian.console.aliyun.com/) to get an API key and run the command below to try it. + +``` +operate -m qwen-vl +``` + #### Try LLaVa Hosted Through Ollama `-m llava` If you wish to experiment with the Self-Operating Computer Framework using LLaVA on your own machine, you can with Ollama! *Note: Ollama currently only supports MacOS and Linux. 
Windows now in Preview* From 64929e238b8d890473a699492fead4dd243a7c3a Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:37:53 +0100 Subject: [PATCH 07/37] Update config.py implementing Claude 3.7 and Qwen-VL API KEY request --- operate/config.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index b97b20ac..ca8f16d6 100644 --- a/operate/config.py +++ b/operate/config.py @@ -44,6 +44,10 @@ def __init__(self): None # instance variables are backups in case saving to a `.env` fails ) + self.qwen_api_key = ( + None # instance variables are backups in case saving to a `.env` fails + ) + def initialize_openai(self): if self.verbose: print("[Config][initialize_openai]") @@ -66,6 +70,29 @@ def initialize_openai(self): client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) return client + def initialize_qwen(self): + if self.verbose: + print("[Config][initialize_qwen]") + + if self.qwen_api_key: + if self.verbose: + print("[Config][initialize_qwen] using cached qwen_api_key") + api_key = self.qwen_api_key + else: + if self.verbose: + print( + "[Config][initialize_qwen] no cached qwen_api_key, try to get from env." + ) + api_key = os.getenv("QWEN_API_KEY") + + client = OpenAI( + api_key=api_key, + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + ) + client.api_key = api_key + client.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + return client + def initialize_google(self): if self.google_api_key: if self.verbose: @@ -119,8 +146,9 @@ def validation(self, model, voice_mode): "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" ) self.require_api_key( - "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" + "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) @@ -147,6 +175,8 @@ def prompt_and_save_api_key(self, key_name, key_description): self.google_api_key = key_value elif key_name == "ANTHROPIC_API_KEY": self.anthropic_api_key = key_value + elif key_name == "QWEN_API_KEY": + self.qwen_api_key = key_value self.save_api_key_to_env(key_name, key_value) load_dotenv() # Reload environment variables # Update the instance attribute with the new key From e9fdb3a6756011aa7b8362023e61ed9276ded44b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 09:58:10 +0100 Subject: [PATCH 08/37] Update config.py few adjustments --- operate/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/operate/config.py b/operate/config.py index ca8f16d6..c3e40060 100644 --- a/operate/config.py +++ b/operate/config.py @@ -124,8 +124,14 @@ def initialize_ollama(self): def initialize_anthropic(self): if self.anthropic_api_key: + if self.verbose: + print("[Config][initialize_anthropic] using cached anthropic_api_key") api_key = self.anthropic_api_key else: + if self.verbose: + print( + "[Config][initialize_anthropic] no cached google_api_key, try to get from env." 
+ ) api_key = os.getenv("ANTHROPIC_API_KEY") return anthropic.Anthropic(api_key=api_key) From 2d1c6ad9b3eb4692643935bfb184aedb58fd1270 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:07:58 +0100 Subject: [PATCH 09/37] Update apis.py added Claude 3.7 and Qwen-VL call functions --- operate/models/apis.py | 228 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/operate/models/apis.py b/operate/models/apis.py index d0ccb0c4..65c2ab16 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -25,6 +25,7 @@ ) from operate.utils.ocr import get_text_coordinates, get_text_element from operate.utils.screenshot import capture_screen_with_cursor +from operate.utils.screenshot import capture_screen_with_cursor, compress_screenshot from operate.utils.style import ANSI_BRIGHT_MAGENTA, ANSI_GREEN, ANSI_RED, ANSI_RESET # Load configuration @@ -37,6 +38,11 @@ async def get_next_action(model, messages, objective, session_id): print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4o(messages), None + if model == "Claude-3.7": + return call_claude_3_7(messages), None + if model == "qwen-vl": + operation = await call_qwen_vl_with_ocr(messages, objective, model) + return operation, None if model == "gpt-4-with-som": operation = await call_gpt_4o_labeled(messages, objective, model) return operation, None @@ -135,6 +141,228 @@ def call_gpt_4o(messages): traceback.print_exc() return call_gpt_4o(messages) +def call_claude_37(messages): + if config.verbose: + print("[call_claude_37]") + time.sleep(1) + + # We'll need to import Anthropic's client library + import anthropic + + try: + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor + capture_screen_with_cursor(screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + # Determine which prompt to use + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + if config.verbose: + print( + "[call_claude_37] user_prompt", + user_prompt, + ) + + # Initialize Anthropic client + # You'll need to configure this in your config module + client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) + + # Convert previous messages to Anthropic format if needed + anthropic_messages = [] + for msg in messages[:-1]: # Skip the last message as we'll handle it specially + if msg["role"] == "system": + # System messages are handled differently in Anthropic API + system_content = msg["content"] + else: + anthropic_messages.append({ + "role": msg["role"], + "content": msg["content"] + }) + + # Create vision message for Claude + # Claude uses a different format for media than OpenAI + vision_message = { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt}, + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + "data": img_base64 + } + } + ] + } + + # Add the vision message to our anthropic messages + anthropic_messages.append(vision_message) + + # Create the message request + response = client.messages.create( + model="claude-3-7-sonnet-20250219", # Claude 3.7 Sonnet model ID + messages=anthropic_messages, + system=system_content if 'system_content' 
in locals() else None, + max_tokens=2048, + ) + + # Extract the content from the response + content = response.content[0].text + content = clean_json(content) + + # Create assistant message + assistant_message = {"role": "assistant", "content": content} + + if config.verbose: + print( + "[call_claude_37] content", + content, + ) + + content = json.loads(content) + messages.append(assistant_message) + return content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. Trying again {ANSI_RESET}", + e, + ) + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", + content if 'content' in locals() else "No content received", + ) + if config.verbose: + traceback.print_exc() + return call_claude_37(messages) + +async def call_qwen_vl_with_ocr(messages, objective, model): + if config.verbose: + print("[call_qwen_vl_with_ocr]") + + # Construct the path to the file within the package + try: + time.sleep(1) + client = config.initialize_qwen() + + confirm_system_prompt(messages, objective, model) + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + + # Call the function to capture the screen with the cursor + raw_screenshot_filename = os.path.join(screenshots_dir, "raw_screenshot.png") + capture_screen_with_cursor(raw_screenshot_filename) + + # Compress screenshot image to make size be smaller + screenshot_filename = os.path.join(screenshots_dir, "screenshot.jpeg") + compress_screenshot(raw_screenshot_filename, screenshot_filename) + + with open(screenshot_filename, "rb") as img_file: + img_base64 = base64.b64encode(img_file.read()).decode("utf-8") + + if len(messages) == 1: + user_prompt = get_user_first_message_prompt() + else: + user_prompt = get_user_prompt() + + vision_message = { + "role": "user", + "content": [ + {"type": "text", + "text": f"{user_prompt}**REMEMBER** Only output json format, do not append any other text."}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, + }, + ], + } + messages.append(vision_message) + + response = client.chat.completions.create( + model="qwen2.5-vl-72b-instruct", + messages=messages, + ) + + content = response.choices[0].message.content + + content = clean_json(content) + + # used later for the messages + content_str = content + + content = json.loads(content) + + processed_content = [] + + for operation in content: + if operation.get("operation") == "click": + text_to_click = operation.get("text") + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_to_click", + text_to_click, + ) + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + result = reader.readtext(screenshot_filename) + + text_element_index = get_text_element( + result, text_to_click, screenshot_filename + ) + coordinates = get_text_coordinates( + result, text_element_index, screenshot_filename + ) + + # add `coordinates`` to `content` + operation["x"] = coordinates["x"] + operation["y"] = coordinates["y"] + + if config.verbose: + print( + "[call_qwen_vl_with_ocr][click] text_element_index", + text_element_index, + ) + print( + "[call_qwen_vl_with_ocr][click] coordinates", + coordinates, + ) + print( + "[call_qwen_vl_with_ocr][click] final operation", + operation, + ) + processed_content.append(operation) + + else: + processed_content.append(operation) + + # wait to append the assistant message so that if the `processed_content` 
step fails we don't append a message and mess up message history + assistant_message = {"role": "assistant", "content": content_str} + messages.append(assistant_message) + + return processed_content + + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}" + ) + if config.verbose: + print("[Self-Operating Computer][Operate] error", e) + traceback.print_exc() + return gpt_4_fallback(messages, objective, model) + def call_gemini_pro_vision(messages, objective): """ From f1101af0adf63e3abbf8b48d410832a0f374d695 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:12:00 +0100 Subject: [PATCH 10/37] Update prompts.py No need to make changes for Claude 3.7 model Prompt selection but instead implemented Qwen-VL model Prompt selection --- operate/models/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index eab7ebdc..f8ddad18 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -270,7 +270,7 @@ def get_system_prompt(model, objective): os_search_str=os_search_str, operating_system=operating_system, ) - elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3": + elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl": prompt = SYSTEM_PROMPT_OCR.format( objective=objective, From 4a6744b5d0318e1ba5d16b8fdb6372d6fb9aa18f Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:15:00 +0100 Subject: [PATCH 11/37] Update screenshot.py no need to make changes for Claude 3.7 screenshot function but added compressed screenshot function for Qwen-VL instead --- operate/utils/screenshot.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/operate/utils/screenshot.py b/operate/utils/screenshot.py index 597911ad..23d492f1 100644 --- a/operate/utils/screenshot.py +++ b/operate/utils/screenshot.py @@ -25,3 +25,18 @@ def capture_screen_with_cursor(file_path): subprocess.run(["screencapture", "-C", file_path]) else: print(f"The platform you're using ({user_platform}) is not currently supported") + + +def compress_screenshot(raw_screenshot_filename, screenshot_filename): + with Image.open(raw_screenshot_filename) as img: + # Check if the image has an alpha channel (transparency) + if img.mode in ('RGBA', 'LA') or (img.mode == 'P' and 'transparency' in img.info): + # Create a white background image + background = Image.new('RGB', img.size, (255, 255, 255)) + # Paste the image onto the background, using the alpha channel as mask + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + # Save the result as JPEG + background.save(screenshot_filename, 'JPEG', quality=85) # Adjust quality as needed + else: + # If no alpha channel, simply convert and save + img.convert('RGB').save(screenshot_filename, 'JPEG', quality=85) From 980e4ee09db32d9dc9c7c31e090595b63afccce5 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:20:29 +0100 Subject: [PATCH 12/37] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index dbb2cf18..5c9e7013 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="self-operating-computer", - version="1.5.7", + version="1.5.9", packages=find_packages(), 
install_requires=required, # Add dependencies here entry_points={ From cdb67afe4ee2b0e9cac718cad938903a9a1559e4 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:32:44 +0100 Subject: [PATCH 13/37] Update config.py intendation correction --- operate/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index c3e40060..a02129d5 100644 --- a/operate/config.py +++ b/operate/config.py @@ -133,7 +133,8 @@ def initialize_anthropic(self): "[Config][initialize_anthropic] no cached google_api_key, try to get from env." ) api_key = os.getenv("ANTHROPIC_API_KEY") - return anthropic.Anthropic(api_key=api_key) + + return anthropic.Anthropic(api_key=api_key) def validation(self, model, voice_mode): """ From eac3aeec7ffeae75b3b1d5f0211f3c3d7ce982cf Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:37:01 +0100 Subject: [PATCH 14/37] Update config.py intendation fix --- operate/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operate/config.py b/operate/config.py index a02129d5..985f3390 100644 --- a/operate/config.py +++ b/operate/config.py @@ -128,13 +128,13 @@ def initialize_anthropic(self): print("[Config][initialize_anthropic] using cached anthropic_api_key") api_key = self.anthropic_api_key else: - if self.verbose: + if self.verbose: print( - "[Config][initialize_anthropic] no cached google_api_key, try to get from env." + "[Config][initialize_anthropic] no cached anthropic_api_key, try to get from env." ) api_key = os.getenv("ANTHROPIC_API_KEY") - - return anthropic.Anthropic(api_key=api_key) + + return anthropic.Anthropic(api_key=api_key) def validation(self, model, voice_mode): """ From 24ef0d9ec4710adf16777dc3f1cb57d5133c6ca7 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:46:14 +0100 Subject: [PATCH 15/37] Update config.py --- operate/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/operate/config.py b/operate/config.py index 985f3390..bba6b155 100644 --- a/operate/config.py +++ b/operate/config.py @@ -153,9 +153,10 @@ def validation(self, model, voice_mode): "GOOGLE_API_KEY", "Google API key", model == "gemini-pro-vision" ) self.require_api_key( - "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" - ) - self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") + "ANTHROPIC_API_KEY", "Anthropic API key", + model == "claude-3" or model == "claude-3-7-sonnet-20250219" + ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) From 6277aa65e8b1df2936888bdc00231a4ae1769f36 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:52:54 +0100 Subject: [PATCH 16/37] Update config.py --- operate/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index bba6b155..d06be2d9 100644 --- a/operate/config.py +++ b/operate/config.py @@ -154,7 +154,7 @@ def validation(self, model, voice_mode): ) self.require_api_key( "ANTHROPIC_API_KEY", "Anthropic API key", - model == "claude-3" or model == "claude-3-7-sonnet-20250219" + model == "claude-3" or model == "claude-3.7" ) 
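The `QWEN_API_KEY` checked on the next line feeds `initialize_qwen()` from PATCH 07, which simply points the OpenAI SDK at DashScope's OpenAI-compatible endpoint. A minimal stand-alone sketch of that call path follows; the screenshot path and prompt text are illustrative placeholders, and the model name is the one used later in `call_qwen_vl_with_ocr`.

```
import base64
import os

from openai import OpenAI

# Same endpoint and key source as Config.initialize_qwen() in this series.
client = OpenAI(
    api_key=os.getenv("QWEN_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

with open("screenshots/screenshot.jpeg", "rb") as img_file:  # illustrative path
    img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

response = client.chat.completions.create(
    model="qwen2.5-vl-72b-instruct",  # model name used in call_qwen_vl_with_ocr
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what is on this screen."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```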
self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") From 5c09de0d3f4bb5ac079146866079201b05e7adc3 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 10:55:06 +0100 Subject: [PATCH 17/37] Update apis.py added coherence to the code --- operate/models/apis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 65c2ab16..7e2706b8 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -38,8 +38,8 @@ async def get_next_action(model, messages, objective, session_id): print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4o(messages), None - if model == "Claude-3.7": - return call_claude_3_7(messages), None + if model == "claude-3.7": + return call_claude_37(messages), None if model == "qwen-vl": operation = await call_qwen_vl_with_ocr(messages, objective, model) return operation, None From 2aa8aa464ac8d21fb990094c631e0cb64705e4b8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:02:58 +0100 Subject: [PATCH 18/37] Update apis.py --- operate/models/apis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 7e2706b8..a95ecc29 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -174,7 +174,7 @@ def call_claude_37(messages): # Initialize Anthropic client # You'll need to configure this in your config module - client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY) + client = anthropic.Anthropic(api_key=config.anthropic_api_key if config.anthropic_api_key else os.getenv("ANTHROPIC_API_KEY")) # Convert previous messages to Anthropic format if needed anthropic_messages = [] From 9b23a09be2694c88d821bc5500caf9ff4e77669c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:07:19 +0100 Subject: [PATCH 19/37] Update apis.py --- operate/models/apis.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index a95ecc29..f8cda256 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -173,9 +173,7 @@ def call_claude_37(messages): ) # Initialize Anthropic client - # You'll need to configure this in your config module - client = anthropic.Anthropic(api_key=config.anthropic_api_key if config.anthropic_api_key else os.getenv("ANTHROPIC_API_KEY")) - + client = config.initialize_anthropic() # Convert previous messages to Anthropic format if needed anthropic_messages = [] for msg in messages[:-1]: # Skip the last message as we'll handle it specially From 20fdacedf30fe9724395789165546567ca29273c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:12:34 +0100 Subject: [PATCH 20/37] Update config.py --- operate/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/config.py b/operate/config.py index d06be2d9..6d72cb13 100644 --- a/operate/config.py +++ b/operate/config.py @@ -155,8 +155,8 @@ def validation(self, model, voice_mode): self.require_api_key( "ANTHROPIC_API_KEY", "Anthropic API key", model == "claude-3" or model == "claude-3.7" - ) - self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") + ) + self.require_api_key("QWEN_API_KEY", "Qwen API key", model == "qwen-vl") def require_api_key(self, key_name, 
key_description, is_required): key_exists = bool(os.environ.get(key_name)) From e703cb0145d7588809231783dd9c47d849759282 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:34:31 +0100 Subject: [PATCH 21/37] Update operate.py change sting to integer in wait operation seconds --- operate/operate.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index b052301a..02c96e9f 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -166,13 +166,12 @@ def operate(operations, model): ) print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") return True - elif operate_type == "wait" or operate_type == "none": - duration = operation.get("duration", 5) # Default to 5 seconds if not specified - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BLUE} Waiting for {duration} seconds...{ANSI_RESET}]" - ) + elif operate_type == "wait": + duration = operation["duration"] + if isinstance(duration, str): + duration = float(duration) # Convert string to float/integer + print(f"{ANSI_GREEN}[Self-Operating Computer | Waiting for {duration} seconds...]{ANSI_RESET}") time.sleep(duration) - operate_detail = f"waiting {duration}s" else: print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" From 34c7e21260e38ee1aef1aac729a929b47c76cfc8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 11:36:34 +0100 Subject: [PATCH 22/37] Update apis.py changed call_claude_37 function to correctly handle requests --- operate/models/apis.py | 112 +++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 39 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index f8cda256..22afd718 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -141,53 +141,76 @@ def call_gpt_4o(messages): traceback.print_exc() return call_gpt_4o(messages) + def call_claude_37(messages): if config.verbose: print("[call_claude_37]") time.sleep(1) - - # We'll need to import Anthropic's client library + + # Import the anthropic module inside the function to ensure it's available import anthropic - + try: screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): os.makedirs(screenshots_dir) screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + # Call the function to capture the screen with the cursor capture_screen_with_cursor(screenshot_filename) - - with open(screenshot_filename, "rb") as img_file: + + # Convert PNG to JPEG format to ensure compatibility + img = Image.open(screenshot_filename) + if img.mode in ('RGBA', 'LA'): + # Remove alpha channel for JPEG compatibility + background = Image.new("RGB", img.size, (255, 255, 255)) + background.paste(img, mask=img.split()[3]) # 3 is the alpha channel + img = background + + # Save as JPEG + jpeg_filename = os.path.join(screenshots_dir, "screenshot.jpg") + img.save(jpeg_filename, "JPEG", quality=95) + + with open(jpeg_filename, "rb") as img_file: img_base64 = base64.b64encode(img_file.read()).decode("utf-8") - + # Determine which prompt to use if len(messages) == 1: user_prompt = get_user_first_message_prompt() else: user_prompt = get_user_prompt() - + if config.verbose: - print( - "[call_claude_37] user_prompt", - user_prompt, - ) - - # Initialize Anthropic client - client = config.initialize_anthropic() - # Convert previous messages to Anthropic format if needed 
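Setting the surrounding retry and message-bookkeeping logic aside, the request that `call_claude_37` ultimately issues against Anthropic's Messages API has roughly the shape sketched below. The system prompt, screenshot path, and question text are illustrative placeholders; the model ID and the base64 image block mirror the patched code.

```
import base64
import os

import anthropic

client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

with open("screenshots/screenshot.jpg", "rb") as img_file:  # illustrative path
    img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

response = client.messages.create(
    model="claude-3-7-sonnet-20250219",  # same model ID the patch uses
    max_tokens=2048,
    system="You are operating a computer; reply with a JSON array of operations.",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What should the next operation be?"},
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": img_base64,
                    },
                },
            ],
        }
    ],
)
print(response.content[0].text)  # Claude's JSON array of operations
```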
+ print("[call_claude_37] user_prompt", user_prompt) + + # Initialize Anthropic client directly with the environment variable + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + api_key = config.anthropic_api_key # Fallback to instance variable + + if config.verbose: + print("[call_claude_37] Using Anthropic API key (masked):", "*" * len(api_key) if api_key else "None") + + client = anthropic.Anthropic(api_key=api_key) + + # Extract system message + system_content = None + if messages and messages[0]["role"] == "system": + system_content = messages[0]["content"] + user_messages = messages[1:-1] if len(messages) > 1 else [] # Skip system message and last message + else: + user_messages = messages[:-1] if messages else [] # No system message, include all but last + + # Convert previous messages to Anthropic format anthropic_messages = [] - for msg in messages[:-1]: # Skip the last message as we'll handle it specially - if msg["role"] == "system": - # System messages are handled differently in Anthropic API - system_content = msg["content"] - else: + for msg in user_messages: + if msg["role"] in ["user", "assistant"]: # Only include user and assistant messages anthropic_messages.append({ "role": msg["role"], "content": msg["content"] }) - + # Create vision message for Claude - # Claude uses a different format for media than OpenAI vision_message = { "role": "user", "content": [ @@ -202,47 +225,58 @@ def call_claude_37(messages): } ] } - - # Add the vision message to our anthropic messages + + # Add the vision message anthropic_messages.append(vision_message) - + + if config.verbose: + print("[call_claude_37] System content length:", len(system_content) if system_content else 0) + print("[call_claude_37] Number of messages:", len(anthropic_messages)) + # Create the message request response = client.messages.create( - model="claude-3-7-sonnet-20250219", # Claude 3.7 Sonnet model ID + model="claude-3-7-sonnet-20250219", messages=anthropic_messages, - system=system_content if 'system_content' in locals() else None, + system=system_content, max_tokens=2048, ) - + # Extract the content from the response content = response.content[0].text content = clean_json(content) - + # Create assistant message assistant_message = {"role": "assistant", "content": content} - + if config.verbose: - print( - "[call_claude_37] content", - content, - ) - + print("[call_claude_37] content", content) + content = json.loads(content) messages.append(assistant_message) return content - + except Exception as e: + error_msg = str(e) print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[Operate] That did not work. 
Trying again {ANSI_RESET}", - e, + error_msg, ) + + # Define content_str before using it to avoid the "referenced before assignment" error + content_str = "No content received" + if 'content' in locals(): + content_str = content + print( f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response was {ANSI_RESET}", - content if 'content' in locals() else "No content received", + content_str, ) + if config.verbose: traceback.print_exc() - return call_claude_37(messages) + + # Fall back to GPT-4o + return call_gpt_4o(messages) async def call_qwen_vl_with_ocr(messages, objective, model): if config.verbose: From 969cd07b2f3c95fd3e1cb607b550bfd23b8d5545 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:13:25 +0100 Subject: [PATCH 23/37] Update operate.py added a scaling factor multiplier and divider, added a double click operation, left a pair of ideas behind but commented them in case somebody needs them. --- operate/operate.py | 431 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 377 insertions(+), 54 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 02c96e9f..32717a36 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -2,6 +2,7 @@ import os import time import asyncio +import pyautogui from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt from operate.exceptions import ModelNotRecognizedException @@ -112,7 +113,7 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): get_next_action(model, messages, objective, session_id) ) - stop = operate(operations, model) + stop = operate(operations, session_id, model) if stop: break @@ -131,60 +132,382 @@ def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): break -def operate(operations, model): - if config.verbose: - print("[Self Operating Computer][operate]") - for operation in operations: - if config.verbose: - print("[Self Operating Computer][operate] operation", operation) - # wait one second before processing each operation - time.sleep(1) - operate_type = operation.get("operation").lower() - operate_thought = operation.get("thought") - operate_detail = "" - if config.verbose: - print("[Self Operating Computer][operate] operate_type", operate_type) - - if operate_type == "press" or operate_type == "hotkey": - keys = operation.get("keys") - operate_detail = keys - operating_system.press(keys) - elif operate_type == "write": - content = operation.get("content") - operate_detail = content - operating_system.write(content) - elif operate_type == "click": - x = operation.get("x") - y = operation.get("y") - click_detail = {"x": x, "y": y} - operate_detail = click_detail - operating_system.mouse(click_detail) - elif operate_type == "done": - summary = operation.get("summary") - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" +# def verify_click_target(x_percent, y_percent, target_description, client): +# import pyautogui +# import base64 +# import io +# from PIL import Image, ImageDraw +# +# screen_width, screen_height = pyautogui.size() +# x = int(float(x_percent) * screen_width) +# y = int(float(y_percent) * screen_height) +# +# region_size = 100 +# region_left = max(0, x - region_size) +# region_top = max(0, y - region_size) +# region_width = min(region_size * 2, screen_width - region_left) +# region_height = min(region_size * 2, screen_height - region_top) +# +# region_screenshot = 
pyautogui.screenshot(region=(region_left, region_top, region_width, region_height)) +# +# draw = ImageDraw.Draw(region_screenshot) +# center_x = x - region_left +# center_y = y - region_top +# line_length = 20 +# draw.line((center_x - line_length, center_y, center_x + line_length, center_y), fill='red', width=2) +# draw.line((center_x, center_y - line_length, center_x, center_y + line_length), fill='red', width=2) +# +# buffer = io.BytesIO() +# region_screenshot.save(buffer, format="JPEG") +# img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') +# +# try: +# verification_prompt = f""" +# I'm about to click at the position marked with the red crosshair. +# I'm trying to click on: "{target_description}" +# +# Does the crosshair appear to be positioned correctly on or very near the target? +# Respond ONLY with "YES" if it's correct or "NO" if it's wrong. +# """ +# +# response = client.messages.create( +# model="claude-3-7-sonnet-20250219", +# messages=[{ +# "role": "user", +# "content": [ +# {"type": "text", "text": verification_prompt}, +# { +# "type": "image", +# "source": { +# "type": "base64", +# "media_type": "image/jpeg", +# "data": img_base64 +# } +# } +# ] +# }], +# max_tokens=50, +# ) +# +# verification_result = response.content[0].text.strip().upper() +# +# print(f"[Click Verification] Target: {target_description}") +# print(f"[Click Verification] Claude's response: {verification_result}") +# +# region_screenshot.save("debug_last_click_verification.jpg") +# +# return "YES" in verification_result +# +# except Exception as e: +# print(f"[Click Verification] Error during verification: {e}") +# return False + + +import cv2 +import numpy as np +import pyautogui +import os +import io +from PIL import Image, ImageDraw + + +def find_icon_on_screen(target_description): + """ + Uses computer vision to find an icon or UI element that matches the target description. + + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main folder") + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + # Take a screenshot of the entire screen + screenshot = pyautogui.screenshot() + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Save the screenshot for debugging + cv2.imwrite("debug_full_screen.jpg", screenshot_rgb) + + # Initialize results + results = [] + + # 1. Text detection for folder/file names (optional, requires pytesseract) + try: + import pytesseract + gray = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2GRAY) + + # Extract text from the screenshot + text_data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT) + + # Look for the target text in detected text + target_words = target_description.lower().split() + + for i, text in enumerate(text_data['text']): + if text and any(word in text.lower() for word in target_words): + # Get coordinates for this text + x = text_data['left'][i] + text_data['width'][i] // 2 + y = text_data['top'][i] + text_data['height'][i] // 2 + + # Add to results with high confidence + results.append((x, y, 0.9)) # 0.9 is confidence score + + # Draw a rectangle around the text for debugging + x1, y1 = text_data['left'][i], text_data['top'][i] + x2 = x1 + text_data['width'][i] + y2 = y1 + text_data['height'][i] + cv2.rectangle(screenshot_rgb, (x1, y1), (x2, y2), (0, 255, 0), 2) + except (ImportError, Exception) as e: + print(f"Text detection not available: {e}") + + # 2. 
Template matching for common desktop icons + icon_folder = "icon_templates" + if os.path.exists(icon_folder): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + template_h, template_w = template.shape[:2] + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + + # Get locations where the match exceeds threshold + threshold = 0.7 + loc = np.where(res >= threshold) + + for pt in zip(*loc[::-1]): + # Get center point of the match + x = pt[0] + template_w // 2 + y = pt[1] + template_h // 2 + confidence = res[pt[1], pt[0]] + + # Add to results + results.append((x, y, confidence)) + + # Draw for debugging + cv2.rectangle(screenshot_rgb, pt, (pt[0] + template_w, pt[1] + template_h), (0, 0, 255), 2) + + # 3. Folder icon detection using color and shape (backup method) + if not results: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + x = int(M["m10"] / M["m00"]) + y = int(M["m01"] / M["m00"]) + + # Add to results with lower confidence + results.append((x, y, 0.5)) + + # Draw for debugging + cv2.drawContours(screenshot_rgb, [contour], -1, (255, 0, 0), 2) + + # Save the annotated screenshot for debugging + cv2.imwrite("debug_target_detection.jpg", screenshot_rgb) + + if results: + # Sort by confidence + results.sort(key=lambda x: x[2], reverse=True) + best_match = results[0] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = best_match[0] / screen_width + y_percent = best_match[1] / screen_height + + return (x_percent, y_percent) + + return None + + +# def enhanced_click(target_description, model=None): +# """ +# Enhanced clicking function that uses computer vision to find and click on targets. 
+# +# Args: +# target_description (str): Description of what to click on +# model (str, optional): Model name for verification +# +# Returns: +# bool: True if click was successful, False otherwise +# """ +# # Try to find the target using computer vision +# coords = find_icon_on_screen(target_description) +# +# if coords: +# x_percent, y_percent = coords +# print(f"[Visual Target Finder] Found target '{target_description}' at ({x_percent:.3f}, {y_percent:.3f})") +# +# # Convert percentages to actual screen coordinates +# screen_width, screen_height = pyautogui.size() +# x_coord = int(x_percent * screen_width) +# y_coord = int(y_percent * screen_height) +# +# # Click on the found location +# pyautogui.click(x_coord, y_coord) +# return True +# else: +# print(f"[Visual Target Finder] Could not find target '{target_description}' on screen") +# return False + + +import pyautogui +import platform +import ctypes +import subprocess + + +def get_scaling_factor(): + """ + Detect the current DPI scaling factor based on the operating system. + Returns: + scaling_factor (float): A multiplier to adjust coordinates. + """ + os_name = platform.system() + scaling_factor = 1.0 + + if os_name == "Windows": + try: + user32 = ctypes.windll.user32 + user32.SetProcessDPIAware() + dc = user32.GetDC(0) + logical_width = user32.GetDeviceCaps(dc, 8) # HORZRES (logical width) + physical_width = user32.GetDeviceCaps(dc, 118) # DESKTOPHORZRES (physical width) + scaling_factor = physical_width / logical_width + user32.ReleaseDC(0, dc) + except Exception as e: + print("Windows scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Darwin": # macOS + try: + output = subprocess.check_output(["system_profiler", "SPDisplaysDataType"]) + output = output.decode("utf-8") + if "Retina" in output: + scaling_factor = 2.0 + else: + scaling_factor = 1.0 + except Exception as e: + print("macOS scaling detection error:", e) + scaling_factor = 1.0 + elif os_name == "Linux": + try: + output = subprocess.check_output( + ["gsettings", "get", "org.gnome.desktop.interface", "scaling-factor"] ) - print(f"{ANSI_BLUE}Objective Complete: {ANSI_RESET}{summary}\n") - return True - elif operate_type == "wait": - duration = operation["duration"] - if isinstance(duration, str): - duration = float(duration) # Convert string to float/integer - print(f"{ANSI_GREEN}[Self-Operating Computer | Waiting for {duration} seconds...]{ANSI_RESET}") + scaling_factor = float(output.decode("utf-8").strip()) + except Exception as e: + print("Linux scaling detection error:", e) + scaling_factor = 1.0 + + return scaling_factor + + +def click_relative(x_percent, y_percent, x_divisor=1.50, y_multiplier=1.25): + """ + Converts relative coordinates to absolute screen coordinates, applies DPI scaling, + then divides the x-coordinate by x_divisor and multiplies the y-coordinate by y_multiplier before clicking. + + Args: + x_percent (float): Relative x-coordinate (e.g., 0.10 for 10% across). + y_percent (float): Relative y-coordinate (e.g., 0.20 for 20% down). + x_divisor (float): Value to divide the computed x-coordinate by (default 1.50). + y_multiplier (float): Value to multiply the computed y-coordinate by (default 1.25). + """ + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. 
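    # (Worked example, not part of the patch: on a 1920x1080 screen with
    # scaling_factor 1.0 and (x_percent, y_percent) = (0.10, 0.20),
    # base = (192.0, 216.0); dividing x by 1.50 and multiplying y by 1.25
    # gives a final click point of (128, 270).)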
+ adjusted_x = int(base_x / x_divisor) + adjusted_y = int(base_y * y_multiplier) + + print( + f"Clicking at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen with scaling factor {scaling_factor}") + pyautogui.click(adjusted_x, adjusted_y) + + +def operate(operations, session_id, model=None): + """ + Processes a list of operations and executes them. + Supports click, doubleclick, write, press, wait, and done operations. + For click operations, it uses the adjusted coordinate conversion: + - x-coordinate divided by 1.50. + - y-coordinate multiplied by 1.25. + """ + import time + + for op in operations: + if op.get("operation") in ["click", "doubleclick"]: + try: + x_percent = float(op.get("x", 0)) + y_percent = float(op.get("y", 0)) + screen_width, screen_height = pyautogui.size() + scaling_factor = get_scaling_factor() + + # Compute the base absolute coordinates. + base_x = x_percent * screen_width * scaling_factor + base_y = y_percent * screen_height * scaling_factor + + # Adjust: divide x-coordinate and multiply y-coordinate. + adjusted_x = int(base_x / 1.50) + adjusted_y = int(base_y * 1.25) + + print( + f"{'Double-clicking' if op.get('operation') == 'doubleclick' else 'Clicking'} " + f"at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " + f"with scaling factor {scaling_factor}" + ) + + if op.get("operation") == "doubleclick": + pyautogui.doubleClick(adjusted_x, adjusted_y) + else: + pyautogui.click(adjusted_x, adjusted_y) + except Exception as e: + print( + f"Error performing {'double-click' if op.get('operation') == 'doubleclick' else 'click'} operation:", + e) + + elif op.get("operation") == "write": + content = op.get("content", "") + pyautogui.write(content) + + elif op.get("operation") == "press": + keys = op.get("keys", []) + for key in keys: + pyautogui.press(key) + + elif op.get("operation") == "wait": + duration = float(op.get("duration", 1)) time.sleep(duration) - else: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] unknown operation response :({ANSI_RESET}" - ) - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response {ANSI_RESET}{operation}" - ) - return True - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]" - ) - print(f"{operate_thought}") - print(f"{ANSI_BLUE}Action: {ANSI_RESET}{operate_type} {operate_detail}\n") + elif op.get("operation") == "done": + print("Operation completed:", op.get("summary", "")) + return True # Stop processing further operations - return False + return False # Continue processing From 6c1c01516b674b63e33ca45ee54ab44d1396a1d8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:14:32 +0100 Subject: [PATCH 24/37] Update apis.py added reliable claude 3.7 usability --- operate/models/apis.py | 493 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 482 insertions(+), 11 deletions(-) diff --git a/operate/models/apis.py b/operate/models/apis.py index 22afd718..7db63ac1 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -142,13 +142,373 @@ def call_gpt_4o(messages): return call_gpt_4o(messages) +def extract_target_from_text(text): + """ + Extract target file/folder names from text with intelligent priority. 
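+    Heuristics are tried in priority order: quoted strings, file-like name patterns,
+    phrases following "click on"/"open", then capitalized words, falling back to the
+    original text if nothing better is found.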
+ + Args: + text (str): Text to analyze (thought or operation text) + + Returns: + str: The extracted target description + """ + import re + + # Priority 1: Look for quoted text which often indicates file/folder names + quoted_pattern = re.compile(r"['\"]([^'\"]+)['\"]") + quoted_matches = quoted_pattern.findall(text) + if quoted_matches: + return quoted_matches[0] + + # Priority 2: Look for file/folder patterns (word-word or words with extensions) + file_pattern = re.compile(r"(\w+[-\.]\w+[-\.]\w+|\w+[-\.]\w+)") + file_matches = file_pattern.findall(text) + for match in file_matches: + # Filter out things that don't look like folder/file names + if any(x in match.lower() for x in ['-main', 'folder', 'file', 'image', 'doc', '.', 'sbc']): + return match + + # Priority 3: Look for phrases after "click on X" or "open X" + click_phrases = ["click on ", "click the ", "clicking on ", "clicking the ", "open ", "opening "] + for phrase in click_phrases: + if phrase in text.lower(): + parts = text.lower().split(phrase, 1) + if len(parts) > 1: + # Extract up to a period, comma, or space + target = parts[1].split(".")[0].split(",")[0].strip() + # Only return if it's not too long (likely not a file name if very long) + if 2 <= len(target.split()) <= 5: + return target + + # Priority 4: Look for capitalized words which might be file/folder names + cap_word_pattern = re.compile(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b') + cap_matches = cap_word_pattern.findall(text) + if cap_matches: + # Filter to likely file/folder names + likely_matches = [m for m in cap_matches if len(m) > 3] + if likely_matches: + return likely_matches[0] + + # Default: just return the original text if nothing better found + return text + + +def find_ui_element_by_text_and_vision(target_description, screenshot_filename): + """ + Finds UI elements using multiple methods: text OCR, template matching, and shape detection. + Specialized for finding desktop icons, folders, and common UI elements. 
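+    Candidates from all approaches (saved icon templates, EasyOCR text matches, and
+    colour/shape contours) are scored for confidence, annotated debug images are saved,
+    and the highest-scoring match is returned as screen-percentage coordinates.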
+ + Args: + target_description (str): Description of what we're trying to find (e.g., "sbc-images-main") + screenshot_filename (str): Path to screenshot file + + Returns: + tuple: (x_percent, y_percent) coordinates as percentages of screen width/height, or None if not found + """ + import cv2 + import numpy as np + from PIL import Image + import easyocr + import os + import re + + # Clean up the target description for better matching + target_words = target_description.lower().split() + # Remove common words that don't help with identification + stop_words = ['the', 'a', 'an', 'to', 'on', 'in', 'by', 'it', 'this', 'that', 'for', 'with', 'click', 'double'] + target_words = [word for word in target_words if word not in stop_words] + clean_target = ' '.join(target_words) + + print(f"[Target Finder] Looking for: '{clean_target}'") + + # Load the screenshot + screenshot = Image.open(screenshot_filename) + screenshot_np = np.array(screenshot) + screenshot_rgb = cv2.cvtColor(screenshot_np, cv2.COLOR_RGB2BGR) + + # Create a debug image to visualize findings + debug_img = screenshot_rgb.copy() + + # Results will store all potential matches with their confidence scores + results = [] + + # APPROACH 1: Template matching with saved templates + icon_folder = "icon_templates" + if os.path.exists(icon_folder) and any(os.listdir(icon_folder)): + for filename in os.listdir(icon_folder): + if filename.endswith(('.png', '.jpg')): + # Extract the template name for matching + template_name = filename.replace('_', ' ').replace('.png', '').replace('.jpg', '') + + # Check if template name matches any part of the target + if any(word in template_name.lower() for word in target_words) or \ + any(word in clean_target for word in template_name.lower().split()): + + template_path = os.path.join(icon_folder, filename) + template = cv2.imread(template_path) + + if template is None: + continue + + # Apply template matching + res = cv2.matchTemplate(screenshot_rgb, template, cv2.TM_CCOEFF_NORMED) + min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res) + + if max_val > 0.7: # Good match + template_h, template_w = template.shape[:2] + top_left = max_loc + bottom_right = (top_left[0] + template_w, top_left[1] + template_h) + center_x = top_left[0] + template_w // 2 + center_y = top_left[1] + template_h // 2 + + # Add to results with high confidence since it's a template match + match_score = max_val * 1.5 # Boost template matches + results.append({ + "type": "template", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (top_left[0], top_left[1], bottom_right[0], bottom_right[1]) + }) + + # Draw on debug image + cv2.rectangle(debug_img, top_left, bottom_right, (0, 255, 0), 2) + cv2.putText(debug_img, f"Template: {template_name} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2) + + # APPROACH 2: OCR text detection + try: + # Initialize EasyOCR Reader + reader = easyocr.Reader(["en"]) + + # Read the screenshot + ocr_results = reader.readtext(screenshot_filename) + + for idx, (bbox, text, conf) in enumerate(ocr_results): + text_lower = text.lower() + + # Check for any word match + word_match = False + for word in target_words: + if len(word) > 2 and word in text_lower: # Avoid matching very short words + word_match = True + break + + # Calculate match score based on text similarity + if word_match or clean_target in text_lower or text_lower in clean_target: + # Calculate match score + from difflib import SequenceMatcher + similarity = 
SequenceMatcher(None, clean_target, text_lower).ratio() + match_score = similarity * conf + + # Especially boost exact matches or strong partial matches + if similarity > 0.8: + match_score *= 1.5 + + # Get center of text bounding box + bbox_points = np.array(bbox).astype(int) + center_x = np.mean([p[0] for p in bbox_points]) + center_y = np.mean([p[1] for p in bbox_points]) + + # Calculate bounding box rectangle + x_points = [p[0] for p in bbox_points] + y_points = [p[1] for p in bbox_points] + bbox_rect = (min(x_points), min(y_points), max(x_points), max(y_points)) + + # Add to results + results.append({ + "type": "text", + "text": text, + "confidence": match_score, + "center": (center_x, center_y), + "bbox": bbox_rect + }) + + # Draw on debug image + top_left = (int(bbox_rect[0]), int(bbox_rect[1])) + bottom_right = (int(bbox_rect[2]), int(bbox_rect[3])) + cv2.rectangle(debug_img, top_left, bottom_right, (0, 0, 255), 2) + cv2.putText(debug_img, f"OCR: {text} ({match_score:.2f})", + (top_left[0], top_left[1] - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2) + + # For text results, look for potential UI elements above (desktop icon case) + # If this looks like a desktop icon label, the actual icon is likely above it + if any(word in text_lower for word in ['folder', 'file', 'image', 'doc']) or \ + re.search(r'\w+[-\.]\w+', text_lower) or \ + "sbc" in text_lower: + # Define a region above the text to look for the icon + icon_area_width = bbox_rect[2] - bbox_rect[0] + icon_area_height = icon_area_width # Make it square + icon_area_top = max(0, bbox_rect[1] - icon_area_height - 10) # Above text with a small gap + icon_area_left = bbox_rect[0] + + icon_center_x = icon_area_left + icon_area_width // 2 + icon_center_y = icon_area_top + icon_area_height // 2 + + # Add this as a potential icon location with boosted confidence + icon_match_score = match_score * 1.2 # Boost confidence for icon targets + results.append({ + "type": "icon", + "confidence": icon_match_score, + "center": (icon_center_x, icon_center_y), + "bbox": (icon_area_left, icon_area_top, + icon_area_left + icon_area_width, icon_area_top + icon_area_height) + }) + + # Draw the potential icon area + cv2.rectangle(debug_img, + (int(icon_area_left), int(icon_area_top)), + (int(icon_area_left + icon_area_width), int(icon_area_top + icon_area_height)), + (255, 0, 0), 2) + cv2.putText(debug_img, f"Icon target ({icon_match_score:.2f})", + (int(icon_area_left), int(icon_area_top) - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2) + + except Exception as e: + print(f"[Target Finder] OCR detection error: {e}") + + # APPROACH 3: Folder icon detection (color/shape based) + if "folder" in clean_target or "file" in clean_target or "sbc" in clean_target: + try: + # Convert to HSV for better color segmentation + hsv = cv2.cvtColor(screenshot_rgb, cv2.COLOR_BGR2HSV) + + # Define color ranges for common folder icons (yellow folders in Windows) + lower_yellow = np.array([20, 100, 100]) + upper_yellow = np.array([40, 255, 255]) + + # Create mask for yellow color + mask = cv2.inRange(hsv, lower_yellow, upper_yellow) + + # Find contours in the mask + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter contours by size (folder icons are usually of similar size) + min_area = 100 + max_area = 5000 + + for contour in contours: + area = cv2.contourArea(contour) + if min_area < area < max_area: + # Get center of contour + M = cv2.moments(contour) + if M["m00"] > 0: + center_x = int(M["m10"] / M["m00"]) + 
center_y = int(M["m01"] / M["m00"]) + + # Get bounding box + x, y, w, h = cv2.boundingRect(contour) + + # Add to results with lower confidence for shape-based detection + match_score = 0.5 # Base confidence for shape detection + results.append({ + "type": "shape", + "confidence": match_score, + "center": (center_x, center_y), + "bbox": (x, y, x + w, y + h) + }) + + # Draw on debug image + cv2.rectangle(debug_img, (x, y), (x + w, y + h), (255, 255, 0), 2) + cv2.putText(debug_img, f"Shape ({match_score:.2f})", + (x, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2) + except Exception as e: + print(f"[Target Finder] Shape detection error: {e}") + + # Save the debug image + cv2.imwrite("debug_target_detection.jpg", debug_img) + + if results: + # Sort by confidence + results.sort(key=lambda x: x.get("confidence", 0), reverse=True) + best_match = results[0] + + # Print debug info + print(f"[Target Finder] Best match: {best_match['type']} with confidence {best_match['confidence']:.2f}") + + # Get the center point + center_x, center_y = best_match["center"] + + # Convert to percentage of screen size + screen_width, screen_height = screenshot.size + x_percent = center_x / screen_width + y_percent = center_y / screen_height + + # Mark the final target on the debug image + result_img = cv2.circle(debug_img, (int(center_x), int(center_y)), 10, (0, 255, 255), -1) + cv2.imwrite("debug_final_target.jpg", result_img) + + return (x_percent, y_percent) + + print(f"[Target Finder] No match found for '{clean_target}'") + return None + + +def verify_success(screenshot_before, task_type="open_folder"): + """ + Verifies if an operation was successful by comparing before/after screenshots. + + Args: + screenshot_before: Screenshot taken before the operation + task_type: Type of task we're verifying (open_folder, click_button, etc.) 
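+            (only "open_folder" currently has a concrete check, based on the percentage
+            of pixels that changed between the before/after screenshots; other task
+            types fall through and return False)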
+ + Returns: + bool: True if operation appears successful, False otherwise + """ + import cv2 + import numpy as np + import pyautogui + + # Take a screenshot after the operation + screenshot_after = pyautogui.screenshot() + + # Convert to numpy arrays for comparison + before_np = np.array(screenshot_before) + after_np = np.array(screenshot_after) + + # Resize if dimensions don't match + if before_np.shape != after_np.shape: + after_np = cv2.resize(after_np, (before_np.shape[1], before_np.shape[0])) + + # For opening a folder, check for significant window change + if task_type == "open_folder": + # Calculate difference between images + diff = cv2.absdiff(before_np, after_np) + gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) + _, thresholded = cv2.threshold(gray_diff, 30, 255, cv2.THRESH_BINARY) + + # Calculate percentage of changed pixels + changed_pixels = np.count_nonzero(thresholded) + total_pixels = thresholded.size + change_percentage = (changed_pixels / total_pixels) * 100 + + # Save debug images + cv2.imwrite("debug_before.jpg", cv2.cvtColor(before_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_after.jpg", cv2.cvtColor(after_np, cv2.COLOR_RGB2BGR)) + cv2.imwrite("debug_diff.jpg", thresholded) + + print(f"[Verification] Screen change: {change_percentage:.2f}%") + + # If significant portion of screen changed, likely a new window opened + return change_percentage > 15 + + return False + + def call_claude_37(messages): if config.verbose: print("[call_claude_37]") time.sleep(1) - # Import the anthropic module inside the function to ensure it's available + # Import all required modules import anthropic + import cv2 + import numpy as np + import re + import pyautogui + from PIL import Image try: screenshots_dir = "screenshots" @@ -243,17 +603,129 @@ def call_claude_37(messages): # Extract the content from the response content = response.content[0].text - content = clean_json(content) - - # Create assistant message - assistant_message = {"role": "assistant", "content": content} + # Check if Claude added text before the JSON + if content.strip().startswith("[") or content.strip().startswith("{"): + # Content is already in JSON format, just clean it + content = clean_json(content) + else: + # Claude might have added a message before the JSON + # Try to find JSON in the content + json_match = re.search(r'(\[.*\]|\{.*\})', content, re.DOTALL) + if json_match: + # Extract the JSON part + content = clean_json(json_match.group(1)) + else: + # If no JSON found, try to create a done operation + if "done" in content.lower() or "complete" in content.lower(): + content = '[{"thought": "Task complete", "operation": "done"}]' + else: + # Create a fallback operation + content = '[{"thought": "Continuing task", "operation": "wait", "duration": 1}]' + + # Log the cleaned content if config.verbose: - print("[call_claude_37] content", content) + print("[call_claude_37] cleaned content", content) - content = json.loads(content) + # Create assistant message with the original response + assistant_message = {"role": "assistant", "content": response.content[0].text} + + try: + # Try to parse as JSON + parsed_content = json.loads(content) + if config.verbose: + print("[call_claude_37] Successfully parsed content as JSON") + except json.JSONDecodeError as e: + # If JSON parsing fails, create a simple operation + print(f"[call_claude_37] JSON parsing failed: {e}. 
Creating fallback operation.") + parsed_content = [{"thought": "Continuing with task", "operation": "wait", "duration": 1}] + + # Process the operations with enhanced handling + processed_content = [] + + # Check if Claude is trying to do a double-click + need_double_click = False + for operation in parsed_content: + if operation.get("double_click", False): + need_double_click = True + break + if "thought" in operation: + if "double" in operation["thought"].lower() and "click" in operation["thought"].lower(): + need_double_click = True + break + + for i, operation in enumerate(parsed_content): + if operation.get("operation") == "click": + # Extract target description + target_description = "" + if "text" in operation: + target_description = operation.get("text") + elif "thought" in operation: + # Try to extract what we're clicking on from the thought + thought = operation.get("thought", "") + + # Look for quoted text first + quoted_match = re.search(r'[\'"]([^\'\"]+)[\'"]', thought) + if quoted_match: + target_description = quoted_match.group(1) + else: + # Look for instances of "sbc-images-main" or similar patterns + pattern_match = re.search(r'(\b\w+-\w+-\w+\b|\bsbc[- ]\w+\b)', thought, re.IGNORECASE) + if pattern_match: + target_description = pattern_match.group(1) + else: + # Fall back to looking for phrases after click indicators + click_indicators = ["click on", "click the", "clicking on", "clicking the"] + for indicator in click_indicators: + if indicator in thought.lower(): + parts = thought.lower().split(indicator, 1) + if len(parts) > 1: + target_description = parts[1].split(".")[0].split(",")[0].strip() + break + + if not target_description: + target_description = f"target at position ({operation['x']}, {operation['y']})" + + if config.verbose: + print(f"[call_claude_37] Target description: {target_description}") + + # Handle double-clicking if detected + if need_double_click and i == 0: # Only process the first click for double-click + # Extract coordinates + try: + x = operation["x"] + y = operation["y"] + + # Add a special marker to signal double-click + operation["double_click"] = True + + # Log the double-click intention + print( + f"[call_claude_37] Detected double-click operation on '{target_description}' at ({x}, {y})") + except Exception as e: + print(f"[call_claude_37] Error processing double-click: {e}") + + # For double-click operations, we only need to add the first click + # Skip adding second clicks to avoid duplicate operations + if need_double_click and i > 0: + if config.verbose: + print("[call_claude_37] Skipping duplicate click for double-click operation") + continue + + # Add the operation + if config.verbose: + print(f"[call_claude_37] Adding operation: {operation}") + + processed_content.append(operation) + else: + # For non-click operations, just append as is + processed_content.append(operation) + + # Add the assistant message to the history messages.append(assistant_message) - return content + + # Return the processed content + return processed_content if processed_content else [{"operation": "wait", "duration": 1}] except Exception as e: error_msg = str(e) @@ -275,9 +747,8 @@ def call_claude_37(messages): if config.verbose: traceback.print_exc() - # Fall back to GPT-4o - return call_gpt_4o(messages) - + # If an exception occurs, return a simple operation to keep things moving + return [{"thought": "Continuing task after error", "operation": "wait", "duration": 1}] async def call_qwen_vl_with_ocr(messages, objective, model): if config.verbose: 
print("[call_qwen_vl_with_ocr]") From 5d55d53eee358d7e1d51d4be39c655554a8abac8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:15:35 +0100 Subject: [PATCH 25/37] Update prompts.py added double click functionality --- operate/models/prompts.py | 88 +++++++++++++++++++++++++++++---------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index f8ddad18..ad660686 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,28 +13,34 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` [{{ "thought": "write a thought here", "operation": "click", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard +2. doubleclick - Move mouse and double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load + +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -52,7 +58,14 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application +``` +[ + {{ "thought": "I want to open a file or application by double-clicking", "operation": "doubleclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 3: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -61,7 +74,7 @@ ] ``` -Example 3: Waits to the page to load before proceeding to interact +Example 4: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, @@ -83,27 +96,34 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 
+You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` [{{ "thought": "write a thought here", "operation": "click", "label": "~x" }}] # 'percent' refers to the percentage of the screen's dimensions in decimal format ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "label": "~x" }}] +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -120,7 +140,14 @@ ] ``` -Example 2: Focuses on the address bar in a browser before typing a website +Example 2: Double-clicking to open a file or application with a labeled element +``` +[ + {{ "thought": "I want to open a file or application by double-clicking on its labeled element", "operation": "doubleclick", "label": "~42" }} +] +``` + +Example 3: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -129,7 +156,7 @@ ] ``` -Example 3: Send a "Hello World" message in the chat +Example 4: Send a "Hello World" message in the chat ``` [ {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, @@ -137,7 +164,7 @@ ] ``` -Example 4: Waits to the page to load before proceeding to interact +Example 5: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -154,32 +181,39 @@ """ -# TODO: Add an example or instruction about `Action: press ['pagedown']` to scroll SYSTEM_PROMPT_OCR = """ You are operating a {operating_system} computer, using the same operating system as a human. From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 5 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - Look for text to click. 
Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` [{{ "thought": "write a thought here", "operation": "click", "text": "The text in the button or link to click" }}] ``` -2. write - Write with your keyboard + +2. doubleclick - Move mouse and double click - Look for text to double click +``` +[{{ "thought": "write a thought here", "operation": "doubleclick", "text": "The text in the item to double click" }}] +``` + +3. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -3. press - Use a hotkey or press key to operate the computer + +4. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -4. done - The objective is completed + +5. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -5. wait - Wait some time for a page to load +6. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -206,7 +240,14 @@ ] ``` -Example 3: Search for someone on Linkedin when already on linkedin.com +Example 3: Double-clicking to open a file +``` +[ + {{ "thought": "I want to open a file by finding its text label and double-clicking", "operation": "doubleclick", "text": "my_document.txt" }} +] +``` + +Example 4: Search for someone on Linkedin when already on linkedin.com ``` [ {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, @@ -214,7 +255,8 @@ {{ "thought": "Finally I'll submit the search form with enter", "operation": "press", "keys": ["enter"] }} ] ``` -Example 4: Waits to the page to load before proceeding to interact + +Example 5: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, From 40523ef4ecc701f1a63c5ca28b97452cc60ac1f8 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:16:29 +0100 Subject: [PATCH 26/37] Update config.py From 28272bb69bcc103aa2b83925de3c2eb7babee1c5 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 17:19:17 +0100 Subject: [PATCH 27/37] A icon library for computer vision You can place the mouse over an icon and hit enter to make a screenshot of the icons area, the png will be saved on a folder for further use in guideing the SOC through icons. --- operate/setup_icon_templates.py | 117 ++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 operate/setup_icon_templates.py diff --git a/operate/setup_icon_templates.py b/operate/setup_icon_templates.py new file mode 100644 index 00000000..d9ad620b --- /dev/null +++ b/operate/setup_icon_templates.py @@ -0,0 +1,117 @@ +import os +import pyautogui +import time +import tkinter as tk +from tkinter import simpledialog, messagebox + + +def setup_icon_templates(): + """ + Simplified helper script to set up icon templates for visual target finding. + Uses simple coordinate input rather than visual selection. 
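+    Saved templates land in the icon_templates folder, where they are picked up by the
+    template-matching pass in operate/models/apis.py (find_ui_element_by_text_and_vision).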
+ """ + # Create templates directory if it doesn't exist + template_dir = "icon_templates" + if not os.path.exists(template_dir): + os.makedirs(template_dir) + print(f"Created directory: {template_dir}") + + # Create a simple GUI for capturing templates + root = tk.Tk() + root.title("Icon Template Capture Tool") + root.geometry("400x200") + + # Function to capture icon at cursor position + def capture_at_cursor(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Give user time to position cursor + messagebox.showinfo("Capture Icon", + "Position your mouse cursor over the center of the icon you want to capture, then click OK.") + + # Get cursor position + time.sleep(0.5) # Small delay after dialog closes + x, y = pyautogui.position() + + # Capture region around cursor (100x100 pixels) + region_size = 50 # pixels in each direction from center + region = (x - region_size, y - region_size, region_size * 2, region_size * 2) + + try: + # Capture the region + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Function to capture custom region + def capture_custom_region(): + icon_name = simpledialog.askstring("Icon Name", "Enter name for this icon/folder:", parent=root) + if not icon_name: + return + + # Ask for region coordinates + try: + x = simpledialog.askinteger("X Coordinate", "Enter X coordinate (left edge):", parent=root) + if x is None: return + + y = simpledialog.askinteger("Y Coordinate", "Enter Y coordinate (top edge):", parent=root) + if y is None: return + + width = simpledialog.askinteger("Width", "Enter width in pixels:", parent=root, minvalue=10, maxvalue=500) + if width is None: return + + height = simpledialog.askinteger("Height", "Enter height in pixels:", parent=root, minvalue=10, + maxvalue=500) + if height is None: return + + # Capture the specified region + region = (x, y, width, height) + screenshot = pyautogui.screenshot(region=region) + + # Save the template + filename = f"{icon_name.replace(' ', '_').lower()}.png" + filepath = os.path.join(template_dir, filename) + screenshot.save(filepath) + + messagebox.showinfo("Success", f"Saved template as {filename}") + print(f"Saved template as {filepath}") + except Exception as e: + messagebox.showerror("Error", f"Failed to capture: {str(e)}") + print(f"Error: {e}") + + # Create and place buttons + label = tk.Label(root, text="Icon Template Capture Tool", font=("Arial", 14)) + label.pack(pady=10) + + instructions = tk.Label(root, text="Choose a capture method:") + instructions.pack(pady=5) + + button_frame = tk.Frame(root) + button_frame.pack(pady=10) + + cursor_btn = tk.Button(button_frame, text="Capture at Cursor", command=capture_at_cursor, width=20) + cursor_btn.grid(row=0, column=0, padx=10, pady=5) + + region_btn = tk.Button(button_frame, text="Specify Region Manually", command=capture_custom_region, width=20) + region_btn.grid(row=0, column=1, padx=10, pady=5) + + close_btn = tk.Button(root, text="Close", command=root.destroy, width=10) + close_btn.pack(pady=10) + + # Start the GUI + root.mainloop() + + +if __name__ == "__main__": + setup_icon_templates() \ No 
newline at end of file From 6987cdcfddb0c617d4855b00c9320dc461354c6b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sat, 1 Mar 2025 21:17:02 +0100 Subject: [PATCH 28/37] Update README.md Small spelling correction --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb92f29f..9e53992b 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ operate -m o1-with-ocr ### Multimodal Models `-m` #### Try claude 3.7 `-m claude-3.7` -Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Antheopic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. +Use Clude 3.7 with Vision to see how it stacks up to GPT-4-Vision at operating a computer. Navigate to the [Anthropic dashboard](https://console.anthropic.com/dashboard) to get an API key and run the command below to try it. ``` operate -m claude-3.7 From dad71cf9aca2eef38d69c6df5d5e77b6db0736f4 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:41:44 +0100 Subject: [PATCH 29/37] Add files via upload --- GUI_README.md | 121 +++++++++++ gui.py | 563 ++++++++++++++++++++++++++++++++++++++++++++++++++ gui_main.py | 105 ++++++++++ 3 files changed, 789 insertions(+) create mode 100644 GUI_README.md create mode 100644 gui.py create mode 100644 gui_main.py diff --git a/GUI_README.md b/GUI_README.md new file mode 100644 index 00000000..c819af77 --- /dev/null +++ b/GUI_README.md @@ -0,0 +1,121 @@ +# Self-Operating Computer GUI + +A graphical user interface for the Self-Operating Computer, allowing easy interaction with AI models to automate computer tasks. + +## Features + +- **Intuitive Chat Interface**: Communicate with the Self-Operating Computer through a familiar chat interface +- **Live Screenshot Preview**: See what the AI sees in real-time +- **Model Selection**: Choose from multiple AI models including GPT-4, Claude, Qwen, and more +- **Voice Control**: Speak your commands using the built-in voice recognition (requires whisper_mic) +- **Real-time Logs**: Monitor detailed logs of operations in real-time +- **Multi-platform**: Works on Windows, macOS, and Linux + +## Installation + +### Prerequisites + +- Python 3.8 or higher +- Self-Operating Computer installed and configured +- pip (Python package manager) + +### Required Packages + +```bash +pip install PyQt5 +pip install whisper_mic # Optional, for voice commands +``` + +## Usage + +### Running the GUI + +From the Self-Operating Computer directory: + +```bash +python gui_main.py +``` + +### Command Line Options + +``` +usage: gui_main.py [-h] [-m MODEL] [--verbose] [--light] + +Run the Self-Operating Computer GUI with a specified model. + +optional arguments: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Specify the default model to use + --verbose Run with verbose logging + --light Use light mode instead of dark mode +``` + +### Examples + +```bash +# Run with GPT-4 model and verbose logging +python gui_main.py -m gpt-4-vision --verbose + +# Run with Claude 3 model in light mode +python gui_main.py -m claude-3 --light +``` + +## Interface Guide + +The GUI is divided into several sections: + +1. **Top Bar**: Contains model selection dropdown and verbose mode toggle +2. **Left Panel**: Displays the current screenshot that the AI sees +3. 
**Right Panel - Top**: Chat history showing your requests and system messages +4. **Right Panel - Bottom**: Detailed logs of operations in real-time +5. **Bottom Input**: Text field for typing tasks, Send button, and voice recording button + +## Model Support + +The GUI supports all models that the Self-Operating Computer supports: + +- GPT-4 Vision +- GPT-4 with SOM (Spatial Object Memory) +- GPT-4 with OCR +- Claude 3 +- Claude 3.7 +- Qwen-VL +- O1 with OCR +- Gemini Pro Vision +- LLaVA + +## API Keys + +The GUI uses the same API key configuration as the main Self-Operating Computer. If a required API key is missing, a prompt will appear asking you to enter it. + +## Troubleshooting + +### Voice Recognition Not Working + +Make sure you have installed whisper_mic: +```bash +pip install whisper_mic +``` + +### GUI Not Launching + +Check that PyQt5 is properly installed: +```bash +pip install PyQt5 +``` + +### Model Not Responding + +Ensure your API keys are properly configured in the Self-Operating Computer settings. + +## Integration with Existing Codebase + +The GUI integrates seamlessly with the existing Self-Operating Computer codebase: + +- It uses the same `operate.py` functions for executing tasks +- It leverages the same model APIs from `apis.py` +- It inherits configuration from `config.py` +- It preserves the same prompt formats from `prompts.py` + +The UI simply provides a graphical wrapper around these core components, making them more accessible to users who prefer not to use the comman \ No newline at end of file diff --git a/gui.py b/gui.py new file mode 100644 index 00000000..ac176bfc --- /dev/null +++ b/gui.py @@ -0,0 +1,563 @@ +import sys +import os +import time +import threading +import asyncio +import platform +import json +import base64 +from PyQt5.QtWidgets import QSizePolicy +from PyQt5.QtWidgets import ( + QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QTextEdit, QLineEdit, QPushButton, QComboBox, QCheckBox, + QLabel, QScrollArea, QFrame, QSplitter, QMessageBox, QProgressBar +) +from PyQt5.QtCore import Qt, QThread, pyqtSignal, pyqtSlot, QSize, QTimer +from PyQt5.QtGui import QFont, QIcon, QTextCursor, QColor, QPalette, QPixmap + +# Import directly from local modules +from operate.models.prompts import USER_QUESTION, get_system_prompt +from operate.config import Config +from operate.models.apis import get_next_action +from operate.utils.screenshot import capture_screen_with_cursor +from operate.exceptions import ModelNotRecognizedException +from operate.operate import operate, get_scaling_factor + +# Setup config +config = Config() + +# Define available models - match the models in apis.py +AVAILABLE_MODELS = [ + "gpt-4-vision", + "gpt-4-with-som", + "gpt-4-with-ocr", + "claude-3", + "claude-3.7", + "qwen-vl", + "o1-with-ocr", + "gemini-pro-vision", + "llava" +] + + +class LogRedirector: + """Redirects print output to the GUI log window""" + + def __init__(self, text_widget): + self.text_widget = text_widget + self.original_stdout = sys.stdout + self.original_stderr = sys.stderr + + def write(self, text): + self.original_stdout.write(text) + self.text_widget.append(text) + # Auto-scroll to bottom + self.text_widget.moveCursor(QTextCursor.End) + + def flush(self): + self.original_stdout.flush() + QApplication.processEvents() + + +class RecordButton(QPushButton): + """Custom button for voice recording that changes appearance when pressed""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setText("Hold to Record") + 
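+        # Checkable so the ":checked" rule in the stylesheet below can switch the
+        # button to its red recording state while it is held down.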
self.setCheckable(True) + self.setStyleSheet(""" + QPushButton { + background-color: #f0f0f0; + border: 2px solid #c0c0c0; + border-radius: 15px; + padding: 8px; + color: #404040; + } + QPushButton:checked { + background-color: #ff4444; + color: white; + border: 2px solid #dd2222; + } + """) + self.mic = None + + +class ScreenshotDisplay(QLabel): + """Widget to display the current screenshot""" + + def __init__(self, parent=None): + super().__init__(parent) + self.setAlignment(Qt.AlignCenter) + self.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding) + self.setMinimumHeight(200) + self.setStyleSheet("background-color: #121212; border: 1px solid #333;") + self.setText("No screenshot available") + + def update_screenshot(self, filename): + if os.path.exists(filename): + pixmap = QPixmap(filename) + # Scale pixmap to fit widget while maintaining aspect ratio + scaled_pixmap = pixmap.scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + else: + self.setText("Screenshot not found") + + def resizeEvent(self, event): + # If we have a pixmap, rescale it when the widget is resized + if hasattr(self, 'pixmap') and self.pixmap(): + scaled_pixmap = self.pixmap().scaled( + self.width(), self.height(), + Qt.KeepAspectRatio, Qt.SmoothTransformation + ) + self.setPixmap(scaled_pixmap) + super().resizeEvent(event) + + +class OperateThread(QThread): + update_signal = pyqtSignal(str) + completed_signal = pyqtSignal() + error_signal = pyqtSignal(str) + screenshot_signal = pyqtSignal(str) + + def __init__(self, model, objective, voice_mode=False, verbose_mode=False): + super().__init__() + self.model = model + self.objective = objective + self.voice_mode = voice_mode + self.verbose_mode = verbose_mode + self.running = True + + def run(self): + try: + config.verbose = self.verbose_mode + config.validation(self.model, self.voice_mode) + + mic = None + if self.voice_mode: + try: + from whisper_mic import WhisperMic + mic = WhisperMic() + self.update_signal.emit("Voice recognition initialized.") + except ImportError: + self.error_signal.emit( + "Voice mode requires 'whisper_mic' module. Install with 'pip install -r requirements-audio.txt'") + return + + system_prompt = get_system_prompt(self.model, self.objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] + loop_count = 0 + session_id = None + + self.update_signal.emit(f"Starting task: {self.objective}") + + task_completed = False + while not task_completed and self.running: + if config.verbose: + self.update_signal.emit(f"[Self Operating Computer] loop_count {loop_count}") + + # Capture screenshot for UI + screenshots_dir = "screenshots" + if not os.path.exists(screenshots_dir): + os.makedirs(screenshots_dir) + screenshot_filename = os.path.join(screenshots_dir, "screenshot.png") + capture_screen_with_cursor(screenshot_filename) + self.screenshot_signal.emit(screenshot_filename) + + # Get next action from the model + operations, session_id = self.run_async( + get_next_action(self.model, messages, self.objective, session_id) + ) + + # Process the operations and update task_completed accordingly + task_completed = operate(operations, session_id, self.model) + + loop_count += 1 + if loop_count > 10: + task_completed = True + self.update_signal.emit("[Self-Operating Computer] Max loop count reached. 
Task considered complete.") + + # If the thread was stopped by the user, we can check the running flag: + if not self.running: + self.update_signal.emit("Task stopped by the user.") + else: + self.update_signal.emit("Task completed.") + self.completed_signal.emit() + + except Exception as e: + self.error_signal.emit(f"Thread error: {str(e)}") + + def stop(self): + self.running = False + + def run_async(self, coroutine): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(coroutine) + finally: + loop.close() + + + +class VoiceRecordingThread(QThread): + finished_signal = pyqtSignal(str) + + def __init__(self, mic): + super().__init__() + self.mic = mic + + def run(self): + try: + # Call listen() without a stop_flag since it's not supported + result = self.mic.listen() + self.finished_signal.emit(result) + except Exception as e: + self.finished_signal.emit(f"Error: {str(e)}") + +class MessageWidget(QFrame): + """Widget to display a single message in the chat view""" + + def __init__(self, text, is_user=False, parent=None): + super().__init__(parent) + self.setFrameShape(QFrame.StyledPanel) + self.setStyleSheet( + "background-color: #c8c8c8; border-radius: 10px; margin: 5px;" if is_user else + "background-color: #d0d0d0; border-radius: 10px; margin: 5px;" + ) + + layout = QVBoxLayout(self) + + # Add a label for the sender + sender = QLabel("You:" if is_user else "System:") + sender.setStyleSheet("font-weight: bold; color: #333;") + layout.addWidget(sender) + + # Add the message text + message = QLabel(text) + message.setWordWrap(True) + message.setTextInteractionFlags(Qt.TextSelectableByMouse) + layout.addWidget(message) + + self.setLayout(layout) + + + +class SOCChatWindow(QMainWindow): + """Main chat window for the Self-Operating Computer interface""" + + def __init__(self): + super().__init__() + + self.setWindowTitle("Self-Operating Computer") + self.setMinimumSize(1000, 700) + + # Initialize mic to None + self.mic = None + self.operate_thread = None + + self.init_ui() + + # Try to initialize whisper_mic if available + try: + from whisper_mic import WhisperMic + self.mic = WhisperMic() + self.record_button.setEnabled(True) + except ImportError: + self.record_button.setEnabled(False) + self.record_button.setToolTip("Install whisper_mic module to use voice") + + def init_ui(self): + """Initialize the user interface""" + # Create the central widget and main layout + central_widget = QWidget() + main_layout = QVBoxLayout(central_widget) + + # Settings bar at the top + settings_layout = QHBoxLayout() + + # Model selection dropdown + model_label = QLabel("Model:") + self.model_combo = QComboBox() + self.model_combo.addItems(AVAILABLE_MODELS) + self.model_combo.setCurrentIndex( + AVAILABLE_MODELS.index("gpt-4-with-ocr") if "gpt-4-with-ocr" in AVAILABLE_MODELS else 0) + + # Verbose mode checkbox + self.verbose_checkbox = QCheckBox("Verbose Logs") + + # Add widgets to settings layout + settings_layout.addWidget(model_label) + settings_layout.addWidget(self.model_combo) + settings_layout.addWidget(self.verbose_checkbox) + settings_layout.addStretch(1) + + # Add settings to main layout + main_layout.addLayout(settings_layout) + + # Create a horizontal splitter for screenshot and chat views + h_splitter = QSplitter(Qt.Horizontal) + + # Left panel - Screenshot view + screenshot_container = QWidget() + screenshot_layout = QVBoxLayout(screenshot_container) + + # Screenshot label + screenshot_label = QLabel("Screen Preview:") + 
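+        # The preview below is refreshed from OperateThread.screenshot_signal after each
+        # loop's screen capture, so the panel tracks the screen as the agent operates.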
screenshot_layout.addWidget(screenshot_label) + + # Screenshot display + self.screenshot_display = ScreenshotDisplay() + screenshot_layout.addWidget(self.screenshot_display) + + h_splitter.addWidget(screenshot_container) + + # Right panel - Chat view and log + chat_log_splitter = QSplitter(Qt.Vertical) + + # Chat view area (top part of right panel) + chat_container = QWidget() + chat_layout = QVBoxLayout(chat_container) + + # Create the scrollable chat view + self.chat_scroll_area = QScrollArea() + self.chat_scroll_area.setWidgetResizable(True) + self.chat_content = QWidget() + self.chat_content_layout = QVBoxLayout(self.chat_content) + self.chat_content_layout.addStretch(1) # Push messages to the top + + self.chat_scroll_area.setWidget(self.chat_content) + chat_layout.addWidget(self.chat_scroll_area) + + # Input area + input_layout = QHBoxLayout() + + # Text input field + self.text_input = QLineEdit() + self.text_input.setPlaceholderText("Type your request here...") + self.text_input.returnPressed.connect(self.send_message) + + # Record button + self.record_button = RecordButton() + self.record_button.pressed.connect(self.start_recording) + self.record_button.released.connect(self.stop_recording) + + # Send button + self.send_button = QPushButton("Send") + self.send_button.clicked.connect(self.send_message) + + # **New Stop button** + self.stop_button = QPushButton("Stop") + self.stop_button.clicked.connect(self.stop_task) + + # Add widgets to input layout + input_layout.addWidget(self.text_input) + input_layout.addWidget(self.record_button) + input_layout.addWidget(self.send_button) + input_layout.addWidget(self.stop_button) # Add the Stop button + + # Add input area to chat layout + chat_layout.addLayout(input_layout) + + # Log view (bottom part of right panel) + self.log_view = QTextEdit() + self.log_view.setReadOnly(True) + self.log_view.setStyleSheet("font-family: Consolas, monospace; background-color: #222; color: #ddd;") + + # Add chat view and log view to the chat_log_splitter + chat_log_splitter.addWidget(chat_container) + chat_log_splitter.addWidget(self.log_view) + chat_log_splitter.setStretchFactor(0, 3) # Give chat view more space + chat_log_splitter.setStretchFactor(1, 2) + + # Add chat_log_splitter to the right side of h_splitter + h_splitter.addWidget(chat_log_splitter) + h_splitter.setStretchFactor(0, 1) # Screenshot area + h_splitter.setStretchFactor(1, 2) # Chat + log area + + # Add h_splitter to main layout + main_layout.addWidget(h_splitter) + + # Add progress indicator at the bottom (hidden by default) + self.progress_bar = QProgressBar() + self.progress_bar.setRange(0, 0) # Indeterminate mode + self.progress_bar.setVisible(False) + main_layout.addWidget(self.progress_bar) + + # Set the central widget + self.setCentralWidget(central_widget) + + # Redirect stdout to the log view + self.log_redirector = LogRedirector(self.log_view) + sys.stdout = self.log_redirector + sys.stderr = self.log_redirector + + # Add a welcome message to the chat + self.add_message("Welcome to Self-Operating Computer! 
What would you like done?", is_user=False) + + # Set focus to the text input + self.text_input.setFocus() + + # Check for screenshots directory and display the latest screenshot if available + screenshots_dir = "screenshots" + if os.path.exists(screenshots_dir): + screenshot_files = [f for f in os.listdir(screenshots_dir) if f.endswith('.png')] + if screenshot_files: + latest_screenshot = os.path.join(screenshots_dir, sorted(screenshot_files)[-1]) + self.screenshot_display.update_screenshot(latest_screenshot) + + def add_message(self, text, is_user=True): + """Add a message to the chat view""" + message_widget = MessageWidget(text, is_user) + self.chat_content_layout.insertWidget(self.chat_content_layout.count() - 1, message_widget) + + # Scroll to the bottom to show the new message + self.chat_scroll_area.verticalScrollBar().setValue( + self.chat_scroll_area.verticalScrollBar().maximum() + ) + + def send_message(self): + """Send a message and start processing the task""" + text = self.text_input.text().strip() + if not text: + return + + # Add the message to the chat view + self.add_message(text, is_user=True) + self.text_input.clear() + + # Start processing in a separate thread + self.process_task(text) + + def process_task(self, objective): + """Process a task in a separate thread""" + # Disable input while processing + self.text_input.setEnabled(False) + self.send_button.setEnabled(False) + self.record_button.setEnabled(False) + self.model_combo.setEnabled(False) + self.verbose_checkbox.setEnabled(False) + + # Show progress indicator + self.progress_bar.setVisible(True) + + # Get selected model and verbose setting + model = self.model_combo.currentText() + verbose = self.verbose_checkbox.isChecked() + + # Create and start the thread + self.operate_thread = OperateThread(model, objective, False, verbose) + self.operate_thread.update_signal.connect(self.update_log) + self.operate_thread.completed_signal.connect(self.task_completed) + self.operate_thread.error_signal.connect(self.handle_error) + self.operate_thread.screenshot_signal.connect(self.update_screenshot) + self.operate_thread.start() + + @pyqtSlot() + def stop_task(self): + if self.operate_thread is not None and self.operate_thread.isRunning(): + self.operate_thread.stop() # Signal the thread to stop + self.operate_thread.wait() # Wait for it to finish + self.add_message("Task stopped by the user.", is_user=False) + + # Re-enable input and hide progress indicator + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.record_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + self.progress_bar.setVisible(False) + self.text_input.setFocus() + + @pyqtSlot(str) + def update_log(self, text): + """Update the log view with new text""" + print(text) + + @pyqtSlot(str) + def update_screenshot(self, filename): + """Update the screenshot display with the latest screenshot""" + self.screenshot_display.update_screenshot(filename) + + @pyqtSlot() + def task_completed(self): + """Handle task completion""" + # Add completion message to chat + self.add_message("Task completed! 
What would you like to do next?", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + @pyqtSlot(str) + def handle_error(self, error_text): + """Handle errors from the operate thread""" + print(f"ERROR: {error_text}") + self.add_message(f"An error occurred: {error_text}", is_user=False) + + # Re-enable input + self.text_input.setEnabled(True) + self.send_button.setEnabled(True) + self.model_combo.setEnabled(True) + self.verbose_checkbox.setEnabled(True) + if self.mic: + self.record_button.setEnabled(True) + + # Hide progress indicator + self.progress_bar.setVisible(False) + + # Set focus back to text input + self.text_input.setFocus() + + def start_recording(self): + """Start voice recording""" + if not self.mic: + return + + self.record_thread = VoiceRecordingThread(self.mic) + self.record_thread.finished_signal.connect(self.process_voice_result) + self.record_thread.start() + + def stop_recording(self): + """Stop voice recording gracefully.""" + if hasattr(self, 'record_thread') and self.record_thread.isRunning(): + self.record_thread.stop() # signal the thread to stop + self.record_thread.wait(2000) # wait up to 2 seconds for the thread to finish + + @pyqtSlot(str) + def process_voice_result(self, result): + """Process the result from voice recognition""" + if result.startswith("Error:"): + QMessageBox.warning(self, "Voice Recognition Error", result) + return + + # Set the recognized text to the input field and send it + self.text_input.setText(result) + self.send_message() + + def closeEvent(self, event): + """Handle window close event""" + # Stop any running thread + if self.operate_thread and self.operate_thread.isRunning(): + self.operate_thread.stop() + self.operate_thread.wait() + + # Restore stdout and stderr + sys.stdout = self.log_redirector.original_stdout + sys.stderr = self.log_redirector.original_stderr + + event.accept() \ No newline at end of file diff --git a/gui_main.py b/gui_main.py new file mode 100644 index 00000000..f0154db6 --- /dev/null +++ b/gui_main.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +""" +Self-Operating Computer GUI +""" +import sys +import os +import argparse +from PyQt5.QtWidgets import QApplication + +# Add the root directory to the system path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Import after setting path +from operate.config import Config +from operate.utils.style import ANSI_BRIGHT_MAGENTA +from gui import SOCChatWindow + + +def main_entry(): + """ + Main entry point for the Self-Operating Computer GUI + """ + parser = argparse.ArgumentParser( + description="Run the Self-Operating Computer GUI with a specified model." 
+ ) + parser.add_argument( + "-m", + "--model", + help="Specify the default model to use", + required=False, + default="gpt-4-with-ocr", + ) + + # Add a flag for verbose mode + parser.add_argument( + "--verbose", + help="Run with verbose logging", + action="store_true", + ) + + # Allow for dark or light mode + parser.add_argument( + "--light", + help="Use light mode instead of dark mode", + action="store_true", + ) + + try: + args = parser.parse_args() + + # Create Qt application + app = QApplication(sys.argv) + app.setStyle("Fusion") + + # Apply dark mode palette unless light mode is requested + if not args.light: + from PyQt5.QtGui import QPalette, QColor + from PyQt5.QtCore import Qt + + palette = QPalette() + palette.setColor(QPalette.Window, QColor(53, 53, 53)) + palette.setColor(QPalette.WindowText, Qt.white) + palette.setColor(QPalette.Base, QColor(25, 25, 25)) + palette.setColor(QPalette.AlternateBase, QColor(53, 53, 53)) + palette.setColor(QPalette.ToolTipBase, Qt.white) + palette.setColor(QPalette.ToolTipText, Qt.white) + palette.setColor(QPalette.Text, Qt.white) + palette.setColor(QPalette.Button, QColor(53, 53, 53)) + palette.setColor(QPalette.ButtonText, Qt.white) + palette.setColor(QPalette.BrightText, Qt.red) + palette.setColor(QPalette.Link, QColor(42, 130, 218)) + palette.setColor(QPalette.Highlight, QColor(42, 130, 218)) + palette.setColor(QPalette.HighlightedText, Qt.black) + app.setPalette(palette) + + # Initialize configuration + config = Config() + config.verbose = args.verbose + + # Create and show the main window + window = SOCChatWindow() + + # Set the default model based on command-line argument + model_index = window.model_combo.findText(args.model) + if model_index >= 0: + window.model_combo.setCurrentIndex(model_index) + + # Set verbose checkbox based on command-line argument + window.verbose_checkbox.setChecked(args.verbose) + + # Show the window + window.show() + + # Run the application + sys.exit(app.exec_()) + + except KeyboardInterrupt: + print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...") + except Exception as e: + print(f"Error starting GUI: {str(e)}") + sys.exit(1) + + +if __name__ == "__main__": + main_entry() \ No newline at end of file From 296ee78987b489e0759ab17b2686a62af3ea2b5b Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:42:19 +0100 Subject: [PATCH 30/37] Update operate.py --- operate/operate.py | 270 ++++++++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 114 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 32717a36..18bf4d03 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -5,31 +5,9 @@ import pyautogui from prompt_toolkit.shortcuts import message_dialog from prompt_toolkit import prompt -from operate.exceptions import ModelNotRecognizedException import platform # from operate.models.prompts import USER_QUESTION, get_system_prompt -from operate.models.prompts import ( - USER_QUESTION, - get_system_prompt, -) -from operate.config import Config -from operate.utils.style import ( - ANSI_GREEN, - ANSI_RESET, - ANSI_YELLOW, - ANSI_RED, - ANSI_BRIGHT_MAGENTA, - ANSI_BLUE, - style, -) -from operate.utils.operating_system import OperatingSystem -from operate.models.apis import get_next_action - -# Load configuration -config = Config() -operating_system = OperatingSystem() - def main(model, terminal_prompt, voice_mode=False, verbose_mode=False): """ @@ -43,93 +21,127 @@ def main(model, terminal_prompt, voice_mode=False, 
verbose_mode=False): Returns: None """ + from operate.config import Config + from operate.exceptions import ModelNotRecognizedException + + from operate.utils.style import ( + ANSI_GREEN, + ANSI_RESET, + ANSI_YELLOW, + ANSI_RED, + ANSI_BRIGHT_MAGENTA, + ANSI_BLUE, + style, + ) + + from operate.utils.operating_system import OperatingSystem + from operate.models.prompts import ( + USER_QUESTION, + get_system_prompt, + ) + + # Load configuration + config = Config() + operating_system = OperatingSystem() + + from operate.models.apis import get_next_action + + while True: # Add outer loop to enable restarting after completion + mic = None + # Initialize `WhisperMic`, if `voice_mode` is True + + config.verbose = verbose_mode + config.validation(model, voice_mode) + + if voice_mode: + try: + from whisper_mic import WhisperMic - mic = None - # Initialize `WhisperMic`, if `voice_mode` is True - - config.verbose = verbose_mode - config.validation(model, voice_mode) - - if voice_mode: - try: - from whisper_mic import WhisperMic - - # Initialize WhisperMic if import is successful - mic = WhisperMic() - except ImportError: + # Initialize WhisperMic if import is successful + mic = WhisperMic() + except ImportError: + print( + "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + ) + sys.exit(1) + + # Skip message dialog if prompt was given directly + if not terminal_prompt: + message_dialog( + title="Self-Operating Computer", + text="An experimental framework to enable multimodal models to operate computers", + style=style, + ).run() + + else: + print("Running direct prompt...") + + # # Clear the console + if platform.system() == "Windows": + os.system("cls") + else: + print("\033c", end="") + + if terminal_prompt and not hasattr(main, 'first_run_complete'): + # Only use the terminal prompt on the first iteration + objective = terminal_prompt + main.first_run_complete = True + elif voice_mode: print( - "Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'" + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)" ) - sys.exit(1) - - # Skip message dialog if prompt was given directly - if not terminal_prompt: - message_dialog( - title="Self-Operating Computer", - text="An experimental framework to enable multimodal models to operate computers", - style=style, - ).run() - - else: - print("Running direct prompt...") - - # # Clear the console - if platform.system() == "Windows": - os.system("cls") - else: - print("\033c", end="") - - if terminal_prompt: # Skip objective prompt if it was given as an argument - objective = terminal_prompt - elif voice_mode: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... 
(speak now)" - ) - try: - objective = mic.listen() - except Exception as e: - print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") - return # Exit if voice input fails - else: - print( - f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" - ) - print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") - objective = prompt(style=style) - - system_prompt = get_system_prompt(model, objective) - system_message = {"role": "system", "content": system_prompt} - messages = [system_message] - - loop_count = 0 - - session_id = None - - while True: - if config.verbose: - print("[Self Operating Computer] loop_count", loop_count) - try: - operations, session_id = asyncio.run( - get_next_action(model, messages, objective, session_id) + try: + objective = mic.listen() + except Exception as e: + print(f"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}") + return # Exit if voice input fails + else: + print( + f"[{ANSI_GREEN}Self-Operating Computer {ANSI_RESET}|{ANSI_BRIGHT_MAGENTA} {model}{ANSI_RESET}]\n{USER_QUESTION}" ) + print(f"{ANSI_YELLOW}[User]{ANSI_RESET}") + objective = prompt(style=style) - stop = operate(operations, session_id, model) - if stop: - break + system_prompt = get_system_prompt(model, objective) + system_message = {"role": "system", "content": system_prompt} + messages = [system_message] - loop_count += 1 - if loop_count > 10: - break - except ModelNotRecognizedException as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break - except Exception as e: - print( - f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" - ) - break + loop_count = 0 + + session_id = None + + task_completed = False # Flag to indicate if the task was completed + while not task_completed: + if config.verbose: + print("[Self Operating Computer] loop_count", loop_count) + try: + operations, session_id = asyncio.run( + get_next_action(model, messages, objective, session_id) + ) + + # Instead of breaking out of the whole program, we set a flag if "done" is reached + task_completed = operate(operations, session_id, model) + + loop_count += 1 + if loop_count > 10: + task_completed = True # Force completion if loop count exceeds 10 + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_YELLOW} Max loop count reached. Moving to next task.{ANSI_RESET}") + except ModelNotRecognizedException as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + except Exception as e: + print( + f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}" + ) + task_completed = True # Exit inner loop and start over + + print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Task completed. Ready for a new task.") + if terminal_prompt: + # If the session was started with a terminal prompt, we need to clear it after the first use + terminal_prompt = None # def verify_click_target(x_percent, y_percent, target_description, client): @@ -455,15 +467,18 @@ def click_relative(x_percent, y_percent, x_divisor=1.50, y_multiplier=1.25): def operate(operations, session_id, model=None): """ Processes a list of operations and executes them. - Supports click, doubleclick, write, press, wait, and done operations. - For click operations, it uses the adjusted coordinate conversion: + Supports click, doubleclick, rightclick, scroll, write, press, wait, and done operations. 
+ For click/doubleclick/rightclick operations, it uses the adjusted coordinate conversion: - x-coordinate divided by 1.50. - y-coordinate multiplied by 1.25. + + Returns: + bool: True if "done" operation was encountered (task completed), otherwise False """ import time for op in operations: - if op.get("operation") in ["click", "doubleclick"]: + if op.get("operation") in ["click", "doubleclick", "rightclick"]: try: x_percent = float(op.get("x", 0)) y_percent = float(op.get("y", 0)) @@ -478,20 +493,47 @@ def operate(operations, session_id, model=None): adjusted_x = int(base_x / 1.50) adjusted_y = int(base_y * 1.25) + operation_type = op.get("operation") + operation_name = { + "click": "Clicking", + "doubleclick": "Double-clicking", + "rightclick": "Right-clicking" + }.get(operation_type, operation_type) + print( - f"{'Double-clicking' if op.get('operation') == 'doubleclick' else 'Clicking'} " - f"at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " + f"{operation_name} at ({adjusted_x}, {adjusted_y}) on a {screen_width}x{screen_height} screen " f"with scaling factor {scaling_factor}" ) - if op.get("operation") == "doubleclick": + if operation_type == "doubleclick": pyautogui.doubleClick(adjusted_x, adjusted_y) + elif operation_type == "rightclick": + pyautogui.rightClick(adjusted_x, adjusted_y) else: pyautogui.click(adjusted_x, adjusted_y) except Exception as e: - print( - f"Error performing {'double-click' if op.get('operation') == 'doubleclick' else 'click'} operation:", - e) + print(f"Error performing {op.get('operation')} operation:", e) + + elif op.get("operation") == "scroll": + try: + direction = op.get("direction", "down") + amount = int(op.get("amount", 3)) + + # Convert direction to clicks (positive for down/right, negative for up/left) + clicks = amount + if direction in ["up", "left"]: + clicks = -amount + + if direction in ["up", "down"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.scroll(clicks) + elif direction in ["left", "right"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.hscroll(clicks) + else: + print(f"Invalid scroll direction: {direction}") + except Exception as e: + print(f"Error performing scroll operation:", e) elif op.get("operation") == "write": content = op.get("content", "") @@ -508,6 +550,6 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "done": print("Operation completed:", op.get("summary", "")) - return True # Stop processing further operations + return True # Signal that the task is completed - return False # Continue processing + return False # Continue processing this task From 87240336f182e357fe1879b1330908560778a445 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:43:02 +0100 Subject: [PATCH 31/37] Update prompts.py --- operate/models/prompts.py | 121 ++++++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 25 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index ad660686..197ab143 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -13,7 +13,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. 
The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click ``` @@ -25,22 +25,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format ``` -3. write - Write with your keyboard +3. rightclick - Move mouse and right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "x": "x percent (e.g. 0.10)", "y": "y percent (e.g. 0.13)" }}] # "percent" refers to the percentage of the screen's dimensions in decimal format +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -65,7 +75,21 @@ ] ``` -Example 3: Focuses on the address bar in a browser before typing a website +Example 3: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options", "operation": "rightclick", "x": "0.50", "y": "0.60" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -74,7 +98,7 @@ ] ``` -Example 4: Waits to the page to load before proceeding to interact +Example 6: Waits for the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5"}}, @@ -96,7 +120,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` ``` @@ -108,22 +132,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "label": "~x" }}] ``` -3. write - Write with your keyboard +3. 
rightclick - Move mouse and right click - We labeled the clickable elements with red bounding boxes and IDs. Label IDs are in the following format with `x` being a number: `~x` +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "label": "~x" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -147,7 +181,21 @@ ] ``` -Example 3: Focuses on the address bar in a browser before typing a website +Example 3: Right-clicking to open a context menu with a labeled element +``` +[ + {{ "thought": "I want to open the context menu for this element to see available options", "operation": "rightclick", "label": "~42" }} +] +``` + +Example 4: Scrolling down a webpage +``` +[ + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 5: Focuses on the address bar in a browser before typing a website ``` [ {{ "thought": "I'll focus on the address bar in the browser. I can see the browser is open so this should be safe to try", "operation": "press", "keys": [{cmd_string}, "l"] }}, @@ -156,7 +204,7 @@ ] ``` -Example 4: Send a "Hello World" message in the chat +Example 6: Send a "Hello World" message in the chat ``` [ {{ "thought": "I see a messsage field on this page near the button. It looks like it has a label", "operation": "click", "label": "~34" }}, @@ -164,7 +212,7 @@ ] ``` -Example 5: Waits to the page to load before proceeding to interact +Example 7: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -186,7 +234,7 @@ From looking at the screen, the objective, and your previous actions, take the next best series of action. -You have 6 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. +You have 8 possible operation actions available to you. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 1. click - Move mouse and click - Look for text to click. Try to find relevant text to click, but if there's nothing relevant enough you can return `"nothing to click"` for the text value and we'll try a different method. ``` @@ -198,22 +246,32 @@ [{{ "thought": "write a thought here", "operation": "doubleclick", "text": "The text in the item to double click" }}] ``` -3. write - Write with your keyboard +3. 
rightclick - Move mouse and right click - Look for text to right click +``` +[{{ "thought": "write a thought here", "operation": "rightclick", "text": "The text in the item to right click" }}] +``` + +4. scroll - Scroll the page up, down, left, or right +``` +[{{ "thought": "write a thought here", "operation": "scroll", "direction": "up|down|left|right", "amount": "number of 'clicks' to scroll (e.g. 3)" }}] +``` + +5. write - Write with your keyboard ``` [{{ "thought": "write a thought here", "operation": "write", "content": "text to write here" }}] ``` -4. press - Use a hotkey or press key to operate the computer +6. press - Use a hotkey or press key to operate the computer ``` [{{ "thought": "write a thought here", "operation": "press", "keys": ["keys to use"] }}] ``` -5. done - The objective is completed +7. done - The objective is completed ``` [{{ "thought": "write a thought here", "operation": "done", "summary": "summary of what was completed" }}] ``` -6. wait - Wait some time for a page to load +8. wait - Wait some time for a page to load ``` [{{ "thought": "write a thought here", "operation": "wait", "duration": "seconds to wait (e.g. 5)" }}] ``` @@ -247,7 +305,21 @@ ] ``` -Example 4: Search for someone on Linkedin when already on linkedin.com +Example 4: Right-clicking to open a context menu +``` +[ + {{ "thought": "I want to open the context menu to see available options for this item", "operation": "rightclick", "text": "my_document.txt" }} +] +``` + +Example 5: Scrolling through content +``` +[ + {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "down", "amount": "5" }} +] +``` + +Example 6: Search for someone on Linkedin when already on linkedin.com ``` [ {{ "thought": "I can see the search field with the placeholder text 'search'. I click that field to search", "operation": "click", "text": "search" }}, @@ -256,7 +328,7 @@ ] ``` -Example 5: Waits to the page to load before proceeding to interact +Example 7: Waits to the page to load before proceeding to interact ``` [ {{ "thought": "It looks like the page I am trying to interact with didn't load yet", "operation": "wait", "duration": "5" }}, @@ -276,17 +348,16 @@ """ OPERATE_FIRST_MESSAGE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait You just started so you are in the terminal app and your code is running in this terminal tab. To leave the terminal, search for a new program on the OS. Action:""" OPERATE_PROMPT = """ -Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. Remember you only have the following 5 operations available: click, write, press, done, wait +Please take the next best action. The `pyautogui` library will be used to execute your decision. Your output will be used in a `json.loads` loads statement. 
Remember you only have the following 8 operations available: click, doubleclick, rightclick, scroll, write, press, done, wait Action:""" - def get_system_prompt(model, objective): """ Format the vision prompt more efficiently and print the name of the prompt used From bc980c2398b723266f57e800c17e964ae547010e Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Sun, 2 Mar 2025 21:45:53 +0100 Subject: [PATCH 32/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e53992b..108bafa6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ome
-  A framework to enable multimodal models to operate a computer.
+  A framework to enable multimodal models to operate a computer, now with a GUI and support for double-click, right-click, scroll, and wait operations.
Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. Released Nov 2023, the Self-Operating Computer Framework was one of the first examples of using a multimodal model to view the screen and operate a computer. From 963b866f8fd14215c8461f5ce18a4612cfc4118d Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:07:04 +0100 Subject: [PATCH 33/37] Update operate.py more scroll integer number so it scrolls faster --- operate/operate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operate/operate.py b/operate/operate.py index 18bf4d03..bbd7fb5b 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -517,7 +517,7 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: direction = op.get("direction", "down") - amount = int(op.get("amount", 3)) + amount = int(op.get("amount", 25)) # Convert direction to clicks (positive for down/right, negative for up/left) clicks = amount From e1d92bbbfbde87c8d318f89510b21658487310bb Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:24:15 +0100 Subject: [PATCH 34/37] Update operate.py now the scroll scrolls faster --- operate/operate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index bbd7fb5b..6dddb2ba 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -517,10 +517,10 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: direction = op.get("direction", "down") - amount = int(op.get("amount", 25)) + amount = int(op.get("amount", 3)) # Convert direction to clicks (positive for down/right, negative for up/left) - clicks = amount + clicks = amount * 15 if direction in ["up", "left"]: clicks = -amount From fa9f9f5b5b7326783648eabd52ccd4fcfe639254 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:49:50 +0100 Subject: [PATCH 35/37] Update operate.py had to redo some lines for the scroll to work well --- operate/operate.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/operate/operate.py b/operate/operate.py index 6dddb2ba..f279ab93 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -516,20 +516,26 @@ def operate(operations, session_id, model=None): elif op.get("operation") == "scroll": try: - direction = op.get("direction", "down") + + direction = op.get("direction") amount = int(op.get("amount", 3)) # Convert direction to clicks (positive for down/right, negative for up/left) - clicks = amount * 15 + + if direction in ["down", "right"]: + clicks = amount * 150 + if direction in ["up", "left"]: - clicks = -amount + clicks = -amount * 150 if direction in ["up", "down"]: print(f"Scrolling {direction} by {amount} clicks") pyautogui.scroll(clicks) + elif direction in ["left", "right"]: print(f"Scrolling {direction} by {amount} clicks") pyautogui.hscroll(clicks) + else: print(f"Invalid scroll direction: {direction}") except Exception as e: From 0ce1ff3844c4d72d02c45e0fc692a324bd68256c Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:05:47 +0100 Subject: [PATCH 36/37] Update operate.py sorry, better now --- operate/operate.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 
deletions(-) diff --git a/operate/operate.py b/operate/operate.py index f279ab93..fb84c3e1 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -514,32 +514,58 @@ def operate(operations, session_id, model=None): except Exception as e: print(f"Error performing {op.get('operation')} operation:", e) + elif op.get("operation") == "scroll": + try: - direction = op.get("direction") - amount = int(op.get("amount", 3)) + direction = op.get("direction", "") + + amount = int(op.get("amount", 0)) + + # For vertical scrolling: positive for up, negative for down - # Convert direction to clicks (positive for down/right, negative for up/left) + if direction == "up": - if direction in ["down", "right"]: clicks = amount * 150 - if direction in ["up", "left"]: + elif direction == "down": + clicks = -amount * 150 + # For horizontal scrolling: negative for left, positive for right + + elif direction == "left": + + clicks = -amount * 150 + + elif direction == "right": + + clicks = amount * 150 + + else: + + print(f"Invalid scroll direction: {direction}") + + clicks = 0 + + # Execute scroll based on direction type + if direction in ["up", "down"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.scroll(clicks) elif direction in ["left", "right"]: + print(f"Scrolling {direction} by {amount} clicks") + pyautogui.hscroll(clicks) - else: - print(f"Invalid scroll direction: {direction}") except Exception as e: - print(f"Error performing scroll operation:", e) + + print("Error performing scroll operation:", e) elif op.get("operation") == "write": content = op.get("content", "") From 3a4d0d9ecdfe3c50b9008398ba87ef391fabe794 Mon Sep 17 00:00:00 2001 From: Koolkatze <120437451+Koolkatze@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:10:05 +0100 Subject: [PATCH 37/37] Update prompts.py changes in scroll prompt to suite better our needs --- operate/models/prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operate/models/prompts.py b/operate/models/prompts.py index 197ab143..c3b4e2ad 100644 --- a/operate/models/prompts.py +++ b/operate/models/prompts.py @@ -85,7 +85,7 @@ Example 4: Scrolling down a webpage ``` [ - {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ``` @@ -191,7 +191,7 @@ Example 4: Scrolling down a webpage ``` [ - {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ``` @@ -315,7 +315,7 @@ Example 5: Scrolling through content ``` [ - {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "down", "amount": "5" }} + {{ "thought": "I need to scroll down to see more content on the page", "operation": "scroll", "direction": "up|down|left|right", "amount": "5" }} ] ```
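
For reference, the scroll handling that PATCH 36/37 converges on can be exercised outside the agent loop with a small standalone helper. The sketch below mirrors the direction-to-clicks mapping in the `operate()` diff (the `* 150` multiplier and the `pyautogui.scroll` / `pyautogui.hscroll` calls are taken from the patch); the `perform_scroll` name and the `dry_run` flag are illustrative additions rather than part of the series, and horizontal scrolling support in `pyautogui` differs across platforms.

```
import pyautogui

# Multiplier used in PATCH 36 so model-suggested amounts scroll a useful distance.
SCROLL_MULTIPLIER = 150


def perform_scroll(direction, amount=3, dry_run=False):
    """Translate a model-issued scroll operation into a pyautogui call.

    Mirrors the mapping in operate(): up/right map to positive clicks,
    down/left to negative clicks, and vertical vs. horizontal directions
    choose between pyautogui.scroll() and pyautogui.hscroll().
    """
    try:
        amount = int(amount)
    except (TypeError, ValueError):
        print(f"Invalid scroll amount: {amount!r}")
        return

    sign = {"up": 1, "right": 1, "down": -1, "left": -1}.get(direction)
    if sign is None:
        print(f"Invalid scroll direction: {direction!r}")
        return

    clicks = sign * amount * SCROLL_MULTIPLIER
    print(f"Scrolling {direction} by {amount} clicks ({clicks} raw units)")
    if dry_run:
        return

    if direction in ("up", "down"):
        pyautogui.scroll(clicks)
    else:
        # hscroll is not implemented on every platform pyautogui supports.
        pyautogui.hscroll(clicks)


if __name__ == "__main__":
    # Example payload shape from the scroll prompt: a direction plus an amount string.
    perform_scroll("down", "5", dry_run=True)
```

Running it with `dry_run=True` only prints the computed click count, which is a convenient way to sanity-check a different multiplier before letting it drive the real scroll wheel.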