Skip to content

Commit 9a38c7b

Browse files
committed
Add gpt-4.1
1 parent de256f5 commit 9a38c7b

File tree

4 files changed

+119
-3
lines changed

4 files changed

+119
-3
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ ome
2020

2121
## Key Features
2222
- **Compatibility**: Designed for various multimodal models.
23-
- **Integration**: Currently integrated with **GPT-4o, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
23+
- **Integration**: Currently integrated with **GPT-4o, GPT-4.1, o1, Gemini Pro Vision, Claude 3, Qwen-VL and LLaVa.**
2424
- **Future Plans**: Support for additional models.
2525

2626
## Demo
@@ -54,12 +54,18 @@ operate
5454

5555
#### OpenAI models
5656

57-
The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below.
57+
The default model for the project is gpt-4o which you can use by simply typing `operate`. To try running OpenAI's new `o1` model, use the command below.
5858

5959
```
6060
operate -m o1-with-ocr
6161
```
6262

63+
To experiment with OpenAI's latest `gpt-4.1` model, run:
64+
65+
```
66+
operate -m gpt-4.1-with-ocr
67+
```
68+
6369

6470
### Multimodal Models `-m`
6571
Try Google's `gemini-pro-vision` by following the instructions below. Start `operate` with the Gemini model

operate/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def validation(self, model, voice_mode):
139139
or voice_mode
140140
or model == "gpt-4-with-som"
141141
or model == "gpt-4-with-ocr"
142+
or model == "gpt-4.1-with-ocr"
142143
or model == "o1-with-ocr",
143144
)
144145
self.require_api_key(

operate/models/apis.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ async def get_next_action(model, messages, objective, session_id):
4646
if model == "gpt-4-with-ocr":
4747
operation = await call_gpt_4o_with_ocr(messages, objective, model)
4848
return operation, None
49+
if model == "gpt-4.1-with-ocr":
50+
operation = await call_gpt_4_1_with_ocr(messages, objective, model)
51+
return operation, None
4952
if model == "o1-with-ocr":
5053
operation = await call_o1_with_ocr(messages, objective, model)
5154
return operation, None
@@ -421,6 +424,112 @@ async def call_gpt_4o_with_ocr(messages, objective, model):
421424
return gpt_4_fallback(messages, objective, model)
422425

423426

427+
async def call_gpt_4_1_with_ocr(messages, objective, model):
    """Ask GPT-4.1 for the next operations and resolve click targets with OCR.

    Captures a screenshot, sends it together with the running conversation to
    the ``gpt-4.1`` chat-completions model, parses the reply as JSON, and for
    every ``click`` operation looks up the on-screen position of the requested
    text with EasyOCR so the operation carries ``x``/``y`` coordinates.

    Args:
        messages: Conversation history in chat-completions format. It is
            mutated in place: the screenshot prompt is appended before the
            request and the assistant's cleaned reply after a successful run.
        objective: The user's objective; forwarded to ``confirm_system_prompt``
            and to the fallback.
        model: The model identifier selected by the caller; used for the
            system prompt, log output, and the fallback. The API request itself
            always targets ``"gpt-4.1"``.

    Returns:
        The list of operations parsed from the model reply, with ``x`` and
        ``y`` added to each ``click`` operation. If anything in the pipeline
        raises, the result of ``gpt_4_fallback(messages, objective, model)``
        is returned instead.
    """
    # Local import keeps this block self-contained regardless of the module's
    # top-level imports.
    import asyncio

    if config.verbose:
        print("[call_gpt_4_1_with_ocr]")

    try:
        # This is a coroutine: time.sleep() would block the whole event loop,
        # so yield to it for the same one-second pause instead.
        await asyncio.sleep(1)
        client = config.initialize_openai()

        confirm_system_prompt(messages, objective, model)
        screenshots_dir = "screenshots"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(screenshots_dir, exist_ok=True)

        screenshot_filename = os.path.join(screenshots_dir, "screenshot.png")
        capture_screen_with_cursor(screenshot_filename)

        with open(screenshot_filename, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        if len(messages) == 1:
            user_prompt = get_user_first_message_prompt()
        else:
            user_prompt = get_user_prompt()

        vision_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    # The screenshot is written as a .png file, so declare the
                    # matching media type in the data URL.
                    "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                },
            ],
        }
        messages.append(vision_message)

        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=messages,
        )

        # Keep the cleaned string: it is what gets stored in the history.
        content_str = clean_json(response.choices[0].message.content)
        operations = json.loads(content_str)

        # OCR is expensive (model load + full-image read) and the screenshot
        # does not change while this loop runs, so run it at most once and
        # only if a click operation actually needs it.
        # NOTE(review): assumes get_text_element / get_text_coordinates treat
        # the OCR result as read-only — confirm against their definitions.
        ocr_result = None
        processed_content = []

        for operation in operations:
            if operation.get("operation") == "click":
                text_to_click = operation.get("text")
                if config.verbose:
                    print(
                        "[call_gpt_4_1_with_ocr][click] text_to_click",
                        text_to_click,
                    )

                if ocr_result is None:
                    reader = easyocr.Reader(["en"])
                    ocr_result = reader.readtext(screenshot_filename)

                text_element_index = get_text_element(
                    ocr_result, text_to_click, screenshot_filename
                )
                coordinates = get_text_coordinates(
                    ocr_result, text_element_index, screenshot_filename
                )

                operation["x"] = coordinates["x"]
                operation["y"] = coordinates["y"]

                if config.verbose:
                    print(
                        "[call_gpt_4_1_with_ocr][click] text_element_index",
                        text_element_index,
                    )
                    print(
                        "[call_gpt_4_1_with_ocr][click] coordinates",
                        coordinates,
                    )
                    print(
                        "[call_gpt_4_1_with_ocr][click] final operation",
                        operation,
                    )

            # Click and non-click operations are both passed through.
            processed_content.append(operation)

        assistant_message = {"role": "assistant", "content": content_str}
        messages.append(assistant_message)

        return processed_content

    except Exception as e:
        # Deliberate catch-all boundary: any failure (API, JSON parsing, OCR
        # lookup) hands control to the fallback rather than aborting the run.
        print(
            f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA}[{model}] That did not work. Trying another method {ANSI_RESET}"
        )
        if config.verbose:
            print("[Self-Operating Computer][Operate] error", e)
            traceback.print_exc()
        return gpt_4_fallback(messages, objective, model)
531+
532+
424533
async def call_o1_with_ocr(messages, objective, model):
425534
if config.verbose:
426535
print("[call_o1_with_ocr]")

operate/models/prompts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def get_system_prompt(model, objective):
232232
os_search_str=os_search_str,
233233
operating_system=operating_system,
234234
)
235-
elif model == "gpt-4-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
235+
elif model == "gpt-4-with-ocr" or model == "gpt-4.1-with-ocr" or model == "o1-with-ocr" or model == "claude-3" or model == "qwen-vl":
236236

237237
prompt = SYSTEM_PROMPT_OCR.format(
238238
objective=objective,

0 commit comments

Comments
 (0)