Add gpt-4.1

joshbickett · joshbickett · commit 9a38c7bb25a2 · 2025-05-13T15:11:33.000-07:00
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ ome
 
 ## Key Features
 - **Compatibility**: Designed for various multimodal models.
-- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
+- **Integration**: Currently integrated with **GPT-4o, GPT-4.1, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
 - **Future Plans**: Support for additional models.
 
 ## Demo
@@ -54,12 +54,18 @@ operate
 
 #### OpenAI models
 
-The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below. 
+The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below.
 
 ```
 operate -m o1-with-ocr
 ```
 
+To experiment with OpenAI's latest `gpt-4.1` model, run:
+
+```
+operate -m gpt-4.1-with-ocr
+```
+
 
 ### Multimodal Models  `-m`
 Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model
diff --git a/operate/config.py b/operate/config.py
@@ -139,6 +139,7 @@ def validation(self, model, voice_mode):
             or voice_mode
             or model == "gpt-4-with-som"
             or model == "gpt-4-with-ocr"
+            or model == "gpt-4.1-with-ocr"
             or model == "o1-with-ocr",
         )
         self.require_api_key(
diff --git a/operate/models/apis.py b/operate/models/apis.py
@@ -46,6 +46,9 @@ async def get_next_action(model, messages, objective, session_id):
     if model == "gpt-4-with-ocr":
         operation = await call_gpt_4o_with_ocr(messages, objective, model)
         return operation, None
+    if model == "gpt-4.1-with-ocr":
+        operation = await call_gpt_4_1_with_ocr(messages, objective, model)
+        return operation, None
     if model == "o1-with-ocr":
         operation = await call_o1_with_ocr(messages, objective, model)
         return operation, None
@@ -421,6 +424,112 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
         return gpt_4_fallback(messages, objective, model)
 
 
+async def call_gpt_4_1_with_ocr(messages, objective, model):
+    """
+    Ask the "gpt-4.1" chat model for the next operations, resolving clicks via OCR.
+
+    Captures a screenshot to screenshots/screenshot.png, appends it (base64
+    encoded) together with a user prompt to `messages`, sends the conversation
+    to `client.chat.completions.create`, and parses the reply as JSON. For each
+    returned operation whose "operation" is "click", the "text" field is looked
+    up in the screenshot with easyocr and the resulting "x"/"y" coordinates are
+    written onto the operation.
+
+    Args:
+        messages: Conversation history. Mutated in place: the vision user
+            message is appended before the API call, and the assistant reply
+            (cleaned JSON text) is appended after a successful parse.
+        objective: Passed through to `confirm_system_prompt` and
+            `gpt_4_fallback`.
+        model: Passed through to `confirm_system_prompt` and `gpt_4_fallback`
+            and shown in the failure message. The API call itself uses the
+            hard-coded model name "gpt-4.1".
+
+    Returns:
+        The list of operations (click operations augmented with "x" and "y").
+        If anything inside the try block raises, the result of
+        `gpt_4_fallback(messages, objective, model)` is returned instead.
+    """
+    if config.verbose:
+        print("[call_gpt_4_1_with_ocr]")
+
+    try:
+        # NOTE(review): time.sleep is a blocking call inside an async function;
+        # presumably a short delay before taking the screenshot — confirm
+        # whether a non-blocking sleep was intended.
+        time.sleep(1)
+        client = config.initialize_openai()
+
+        confirm_system_prompt(messages, objective, model)
+        screenshots_dir = "screenshots"
+        if not os.path.exists(screenshots_dir):
+            os.makedirs(screenshots_dir)
+
+        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
+        capture_screen_with_cursor(screenshot_filename)
+
+        with open(screenshot_filename, "rb") as img_file:
+            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+
+        # A single message in the history selects the first-turn prompt;
+        # otherwise the regular follow-up prompt is used.
+        if len(messages) == 1:
+            user_prompt = get_user_first_message_prompt()
+        else:
+            user_prompt = get_user_prompt()
+
+        # NOTE(review): the file is named screenshot.png but the data URL
+        # below declares image/jpeg — verify the actual encoding written by
+        # capture_screen_with_cursor.
+        vision_message = {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_prompt},
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
+                },
+            ],
+        }
+        messages.append(vision_message)
+
+        response = client.chat.completions.create(
+            model="gpt-4.1",
+            messages=messages,
+        )
+
+        content = response.choices[0].message.content
+
+        content = clean_json(content)
+
+        # Keep the cleaned JSON text for the conversation history before
+        # `content` is rebound to the parsed object.
+        content_str = content
+
+        content = json.loads(content)
+
+        processed_content = []
+
+        # Assumes the parsed JSON is a list of dicts (each item is accessed
+        # with .get) — TODO confirm against the system prompt's output format.
+        for operation in content:
+            if operation.get("operation") == "click":
+                text_to_click = operation.get("text")
+                if config.verbose:
+                    print(
+                        "[call_gpt_4_1_with_ocr][click] text_to_click",
+                        text_to_click,
+                    )
+                # NOTE(review): a new easyocr.Reader is constructed and the
+                # same screenshot is re-read for every click operation in the
+                # response.
+                reader = easyocr.Reader(["en"])
+
+                result = reader.readtext(screenshot_filename)
+
+                text_element_index = get_text_element(
+                    result, text_to_click, screenshot_filename
+                )
+                coordinates = get_text_coordinates(
+                    result, text_element_index, screenshot_filename
+                )
+
+                # Attach the OCR-derived coordinates to the click operation.
+                operation["x"] = coordinates["x"]
+                operation["y"] = coordinates["y"]
+
+                if config.verbose:
+                    print(
+                        "[call_gpt_4_1_with_ocr][click] text_element_index",
+                        text_element_index,
+                    )
+                    print(
+                        "[call_gpt_4_1_with_ocr][click] coordinates",
+                        coordinates,
+                    )
+                    print(
+                        "[call_gpt_4_1_with_ocr][click] final operation",
+                        operation,
+                    )
+                processed_content.append(operation)
+
+            else:
+                # Non-click operations are passed through unchanged.
+                processed_content.append(operation)
+
+        assistant_message = {"role": "assistant", "content": content_str}
+        messages.append(assistant_message)
+
+        return processed_content
+
+    except Exception as e:
+        # Any failure above (API call, JSON parsing, OCR lookup, ...) lands
+        # here and is delegated to gpt_4_fallback. Note that the vision
+        # message may already have been appended to `messages` by this point.
+        print(
+            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
+        )
+        if config.verbose:
+            print("[Self-Operating Computer][Operate] error", e)
+            traceback.print_exc()
+        return gpt_4_fallback(messages, objective, model)
+
+
 async def call_o1_with_ocr(messages, objective, model):
     if config.verbose:
         print("[call_o1_with_ocr]")
diff --git a/operate/models/prompts.py b/operate/models/prompts.py
@@ -232,7 +232,7 @@ def get_system_prompt(model, objective):
             os_search_str=os_search_str,
             operating_system=operating_system,
         )
-    elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
+    elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
 
         prompt = SYSTEM_PROMPT_OCR.format(
             objective=objective,

Original file line number	Diff line number	Diff line change
`@@ -139,6 +139,7 @@ def validation(self, model, voice_mode):`
`139`	`139`	`or voice_mode`
`140`	`140`	`or model == "gpt-4-with-som"`
`141`	`141`	`or model == "gpt-4-with-ocr"`
	`142`	`+ or model == "gpt-4.1-with-ocr"`
`142`	`143`	`or model == "o1-with-ocr",`
`143`	`144`	`)`
`144`	`145`	`self.require_api_key(`
Original file line number	Diff line number	Diff line change
`@@ -232,7 +232,7 @@ def get_system_prompt(model, objective):`
`232`	`232`	`os_search_str=os_search_str,`
`233`	`233`	`operating_system=operating_system,`
`234`	`234`	`)`
`235`		`- elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":`
	`235`	`+ elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":`
`236`	`236`
`237`	`237`	`prompt = SYSTEM_PROMPT_OCR.format(`
`238`	`238`	`objective=objective,`