Create a search based document for self-operating-computer #126

Open
wants to merge 29 commits into main
6 changes: 6 additions & 0 deletions .fdignore
@@ -0,0 +1,6 @@
docs
.*
*/.*
*.pt
CONTRIBUTING.md
LICENSE
6 changes: 6 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,6 @@
!.gitignore
!*
!*/*
cache_db.json
cache_tree.json
vector_cache
1 change: 1 addition & 0 deletions docs/cache_title.json
@@ -0,0 +1 @@
{"_default": {"1": {"path": "/README.md", "hash": "dac3f21a1420b7c2574d16d3ae4f9a61", "title": "Self-Operating Computer Framework: Enhanced Mouse Predictions"}, "2": {"path": "/README.md:1-26", "hash": "bf2a6dd5d2e82ad184ac813bceea7aa1", "title": "Human-Like Computer Control Framework"}, "3": {"path": "/README.md:26-37", "hash": "cb2a33c5f14dbf8102a2060e59e818ce", "title": "Agent-1-Vision Model Overview"}, "4": {"path": "/README.md:37-67", "hash": "59a4d0bd0271be6882794c87dff29f6d", "title": "Improving Mouse Click Accuracy"}, "5": {"path": "/README.md:67-88", "hash": "d85bdbd6d1fcbc9f4ee97480cdead828", "title": "Install, Configure, and Operate: A Comprehensive Guide"}, "6": {"path": "/README.md:89-124", "hash": "ced841634d02dfca9565e18d4b5f2212", "title": "Installing SOCF and GMPV"}, "7": {"path": "/README.md:126-159", "hash": "ee4a2e9e1111e43b1d54c5872d0bee86", "title": "Enable Voice Mode in Self-Operating-Computer Framework"}, "8": {"path": "/README.md:159-172", "hash": "3f8661eaa636ae2a580706d495e189af", "title": "Join HyperWriteAI Discord, Visit #self-operating-computer, Gpt4Vision Model, API Credits Required"}, "9": {"path": "/evaluate.py", "hash": "f145aa4f21a3a71ba002513c87ab1299", "title": "Vision Model Image Evaluation"}, "10": {"path": "/evaluate.py:1-31", "hash": "7fba16836a3eb74270a4399e2b98e84c", "title": "Setting Up Evaluation Test Cases"}, "11": {"path": "/evaluate.py:32-73", "hash": "12d143dc6a38c079f66d6902f399af9b", "title": "ANSI Colors for Terminal Support Detection"}, "12": {"path": "/evaluate.py:75-105", "hash": "46c5cb34fa0a118542f82f10ba3948dd", "title": "Evaluate Summary Screenshot: GPT-4 Vision Model Integration"}, "13": {"path": "/evaluate.py:106-140", "hash": "c4daefc6cdbefdb544c912a83a0ad0f7", "title": "Test Evaluation and Display"}, "14": {"path": "/evaluate.py:141-150", "hash": "d45d9da226c621c8a4eb859326a6ab88", "title": "Test Result Display"}, "15": {"path": "/operate/actions.py", "hash": "57f5b1b6ff8ead6b437d05c3313f9121", "title": "AI-Powered Content Generation"}, "16": {"path": "/operate/actions.py:1-51", "hash": "ed23324e3160350d696973ecf4bbb214", "title": "Action Prediction Model"}, "17": {"path": "/operate/actions.py:52-83", "hash": "bcba7ff32d3526dae272ae4fc448b6fd", "title": "Dynamic Model Caller with Screenshot Capture"}, "18": {"path": "/operate/actions.py:84-115", "hash": "5c6c9f315ae86f4060161b43b7788ccb", "title": "Vision AI Message Encoder"}, "19": {"path": "/operate/actions.py:116-153", "hash": "5cec49a0aa53bbc39d191766c9c25bbd", "title": "Grid Overlay Screenshot Capture"}, "20": {"path": "/operate/actions.py:154-189", "hash": "eb4ce710e8a17943f56f63df8daf74f7", "title": "Screenshot-to-Message AI Model"}, "21": {"path": "/operate/actions.py:189-215", "hash": "64f175114443c2087567cebeb04b2bd0", "title": "Cursor-Guided AI Prompt Enhancement"}, "22": {"path": "/operate/actions.py:217-248", "hash": "41c0cb7cb8d7e4d80238360788b3f389", "title": "GPT-4 Vision Prompt Creation"}, "23": {"path": "/operate/actions.py:249-275", "hash": "d580b3022b3e2f2bf0afa8deb0aacb3f", "title": "Encoding Image for AI Model Generation"}, "24": {"path": "/operate/actions.py:276-305", "hash": "73fcf6c33350aeb75f0963012c4eaab2", "title": "Desktop Screenshot Labeling with GPT-4"}, "25": {"path": "/operate/actions.py:307-338", "hash": "ba4efe6d22d1b5a81e6fc8c269f290f3", "title": "Labeled Click and Decision Prompt System"}, "26": {"path": "/operate/actions.py:340-364", "hash": "f563a4a35fe4d0789d923d3cc416c88b", "title": "API Click Position Calculator"}, "27": {"path": 
"/operate/actions.py:365-387", "hash": "55af95f96b83ca55161b18675bfdcb41", "title": "Click Position Handler"}, "28": {"path": "/operate/actions.py:390-409", "hash": "b6a17c7d0d9473c988fffa40c8d26292", "title": "Fetch OpenAI Chat Completion Asynchronously"}, "29": {"path": "/operate/dialog.py", "hash": "3a2c6d26ce9740e42a7de1536dc6d86b", "title": "Error-Handling User Input in Dialog Operations"}, "30": {"path": "/operate/dialog.py:1-44", "hash": "112a510a9d2a67eafa1d20fe132fa998", "title": "Self-Operating Computer Response Model"}, "31": {"path": "/operate/dialog.py:46-80", "hash": "0d02cf7bfd7d05d1cfc09b37f05a28e8", "title": "Voice Mode and WhisperMic Initialization"}, "32": {"path": "/operate/dialog.py:81-109", "hash": "724810205610934c0b0c7616c9463f47", "title": "Capturing and Processing Voice Inputs"}, "33": {"path": "/operate/dialog.py:110-139", "hash": "2cc50bd85bbb75f03a7a31af6e07c334", "title": "Exception Handling and Action Execution"}, "34": {"path": "/operate/dialog.py:140-171", "hash": "47adabed891a2dfe509d8ad922d7632d", "title": "Action Type Check and Process"}, "35": {"path": "/operate/dialog.py:173-192", "hash": "439961cd470aba1b1e2b03b1ede32d22", "title": "Invalid Input Check and Error Message"}, "36": {"path": "/operate/exceptions.py", "hash": "1cb75cc9cca07c7083349d7687a89fb8", "title": "ModelRecognitionException"}, "37": {"path": "/operate/main.py", "hash": "849cb89bd135d98c287b28a0f59c5927", "title": "Main Entry Point for Self-Operating Computer"}, "38": {"path": "/operate/prompts.py", "hash": "d32925518a57e2532aa7e75757271c19", "title": "Context-Based Prompts for AI-Assisted Google Tools"}, "39": {"path": "/operate/prompts.py:1-33", "hash": "7fe3f3bf9b32af85235d3603b2f28e5f", "title": "Config Settings and Constants in Prompts Module"}, "40": {"path": "/operate/prompts.py:33-63", "hash": "f611be9877035ae65a35db64d7d0f56d", "title": "Interacting with Computers: Tips and Tricks"}, "41": {"path": "/operate/prompts.py:64-82", "hash": "02b8ebf54b5403195e7691775618cfb2", "title": "Cursor Position Prompt"}, "42": {"path": "/operate/prompts.py:82-95", "hash": "5b04c5da962c16bb3d894a23c99edd33", "title": "Guessing Percentages: CLICK Refinement"}, "43": {"path": "/operate/prompts.py:97-135", "hash": "255de85671982a6556ee9c614899ba58", "title": "Interactive Prompts for Efficient Tasks"}, "44": {"path": "/operate/prompts.py:136-159", "hash": "7496511f08f1a40eba955da343b4468c", "title": "AI-Assisted Web Interaction with Labeled Elements"}, "45": {"path": "/operate/prompts.py:161-183", "hash": "03228cf2727dcf901bfcae0a041eda8d", "title": "Contextual JSON Responses"}, "46": {"path": "/operate/prompts.py:185-217", "hash": "4b30adfe9f79f34b42e36eecd0f15b37", "title": "Prompt Formatting Functions"}, "47": {"path": "/operate/prompts.py:218-252", "hash": "832911cb60d101ae0735391a41cfe68c", "title": "Python Prompt Formatting Functions"}, "48": {"path": "/operate/settings.py", "hash": "c2e2734a3eeaee07ea071c3c86ff296a", "title": "Environment Configurations in Settings.py"}, "49": {"path": "/operate/settings.py:1-36", "hash": "10e6b21a56a6acb9ae0b10209c8e1fe1", "title": "Configuration Manager for Settings"}, "50": {"path": "/operate/settings.py:37-39", "hash": "34cb237d1b8fa9362011feddf47f20ab", "title": "Set OpenAI API URL with Env Var or Current Value"}, "51": {"path": "/operate/utils/label.py", "hash": "93c44858f4f65217acb2e06a65501930", "title": "Image Processing Utilities"}, "52": {"path": "/operate/utils/label.py:1-37", "hash": "f3323c934b39bcb21971b31d84675f81", "title": "Validate and 
Retrieve Image Data Functions"}, "53": {"path": "/operate/utils/label.py:40-72", "hash": "2201ff78fb8acd549396411af837f0b0", "title": "Box Overlap Detection and Labeling Functionality"}, "54": {"path": "/operate/utils/label.py:74-101", "hash": "8eb2f932380afd002c8d98174c7497af", "title": "Bounding Box Labeler"}, "55": {"path": "/operate/utils/label.py:102-128", "hash": "24eee879a8765882830fd677207ae500", "title": "Timestamped Image Saving"}, "56": {"path": "/operate/utils/label.py:129-152", "hash": "260012da282588d457f8ea5cc9ceeba7", "title": "Encode Labeled Image in Base64"}, "57": {"path": "/operate/utils/label.py:153-180", "hash": "27ea5f0021a17c4766060c44796992ae", "title": "Triple Backticks Remover & Click Percentage Calculator"}, "58": {"path": "/operate/utils/label.py:182-182", "hash": "2e179942d46f4e6ce7816daa63e99fc7", "title": "Compute Label Percentages"}, "59": {"path": "/operate/utils/misc.py", "hash": "12e996f03cce6e1223030105b6233bdb", "title": "Multifunctional Data Processor"}, "60": {"path": "/operate/utils/misc.py:1-41", "hash": "03abb4876fb8d66ba0e8bcb57fd4f0b5", "title": "Converting and Extracting: Misc.py Functions"}, "61": {"path": "/operate/utils/misc.py:43-74", "hash": "cb7527a4c506c996027a746a68d80cf9", "title": "Parse JSON Response"}, "62": {"path": "/operate/utils/misc.py:75-97", "hash": "cd0376658303f89fd0b2dd6a6f5b61f2", "title": "Response Parser and Classifier"}, "63": {"path": "/operate/utils/misc.py:98-102", "hash": "9b115deb565ae36006d792cc05f1e7c7", "title": "Handling Regex Exceptions in Search Data"}, "64": {"path": "/operate/utils/os.py", "hash": "bc30bc8244012b9c984674646e76944a", "title": "Circular Motion and Text Input Utility"}, "65": {"path": "/operate/utils/os.py:1-44", "hash": "effa7b963a4543f2dfe1cdfac15d1427", "title": "OS Utilities"}, "66": {"path": "/operate/utils/os.py:46-85", "hash": "9ca4ebadb3b3962ae2ede57ae4293792", "title": "Automated OS Interaction Utility"}, "67": {"path": "/operate/utils/os.py:85-105", "hash": "5d1e4e0c093ebfdaf7b03293a6a99c95", "title": "Circular Cursor Clicker"}, "68": {"path": "/operate/utils/os.py:107-131", "hash": "4f70563f895d92d3c7b8abf253f6cc41", "title": "Circular Movement Function and Assistant Message Retrieval"}, "69": {"path": "/operate/utils/screenshot.py", "hash": "90ce939cd173961e5a08c59cbe69dc8f", "title": "Screenshot Capture Utilities"}, "70": {"path": "/operate/utils/screenshot.py:1-39", "hash": "2ed1a42e53589686c334755d1dfff9cf", "title": "Grid Image Overlay"}, "71": {"path": "/operate/utils/screenshot.py:41-63", "hash": "5d94a4c0eb915db6948229b3a80bdff4", "title": "Background Rectangle and Grid Lines Generator"}, "72": {"path": "/operate/utils/screenshot.py:64-92", "hash": "176575b89de4763ec7a3aa300ba0b82f", "title": "Grid Screenshot Labeler"}, "73": {"path": "/operate/utils/screenshot.py:93-114", "hash": "c1de3d2bb95ef77b9721bf4ce2fd8368", "title": "Screenshot Capture Utility"}, "74": {"path": "/operate/utils/screenshot.py:115-143", "hash": "3d7b020d28a3fe6b44899e4522a6e863", "title": "Cross-Platform Screenshot Capture"}, "75": {"path": "/operate/utils/screenshot.py:144-178", "hash": "5de34d85cbb8ccc24e0cf57280d163e9", "title": "Cross-Platform Screenshot Capture Utility"}, "76": {"path": "/operate/utils/screenshot.py:179-182", "hash": "9b2abc85c0106daaaefac2325b90fc7c", "title": "Cross-Platform Screenshot and Cursor Capture Utility"}, "77": {"path": "/operate/utils/style.py", "hash": "ed587651bbe7c27ddabee832a16b492c", "title": "UI Style Configuration with PromptStyle"}, "78": {"path": 
"/operate/utils/style.py:1-34", "hash": "b786525e0df1320220692df064285b2d", "title": "Dialog and UI Styles with PromptStyle"}, "79": {"path": "/operate/utils/style.py:35-36", "hash": "723e74b69583b7713592415d8e097dd2", "title": "Detect Terminal Color Capabilities"}, "80": {"path": "/requirements-audio.txt", "hash": "d7f6b350ada5f0d2fa77095943fa5c98", "title": "Whisper Mic Requirements"}, "81": {"path": "/requirements.txt", "hash": "5ba31a8c2dca3df2b8fb5fe5075416b7", "title": "Python Packages for Project"}, "82": {"path": "/requirements.txt:1-50", "hash": "715045b6e5b276aadf63374adbfbbde7", "title": "Python Package Dependencies List"}, "83": {"path": "/requirements.txt:51-52", "hash": "45bfdb73c9749e654fa2b1fe17dcaab3", "title": "Project Libraries: aiohttp, ultralytics"}, "84": {"path": "/run.sh", "hash": "f6ba03ba77cee9c964f4a03260c51a27", "title": "SOC Linux Install Script"}, "85": {"path": "/run.sh:1-48", "hash": "e5cedb1ea200309d31978d3d03e934b6", "title": "SOC Linux Installation Script"}, "86": {"path": "/run.sh:49-71", "hash": "e2aed4e34a68f996aa2843cf406bdfa2", "title": "Universal Software Installer"}, "87": {"path": "/run.sh:72-115", "hash": "5d1ba14311b8a89212234ce790f61f87", "title": "Automating Python Project Setup"}, "88": {"path": "/run.sh:117-143", "hash": "631882f4968344cbcbd906fb3007ed88", "title": "OpenAI API Key Configurator"}, "89": {"path": "/run.sh:144-155", "hash": "4742d4a7a0b934a691c111136ae81e7f", "title": "MacOS Installation Check"}, "90": {"path": "/setup.py", "hash": "30346e34e45eb4025e616bfdba88d87b", "title": "Setting up 'self-operating-computer'"}}}
669 changes: 669 additions & 0 deletions docs/codeview.html

Large diffs are not rendered by default.

544 changes: 544 additions & 0 deletions docs/data/0.json

Large diffs are not rendered by default.

477 changes: 477 additions & 0 deletions docs/data/1.json

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions docs/data/titles/0.json
@@ -0,0 +1,92 @@
{
"/README.md": "Self-Operating Computer Framework: Enhanced Mouse Predictions",
"/README.md:1-26": "Human-Like Computer Control Framework",
"/README.md:126-159": "Enable Voice Mode in Self-Operating-Computer Framework",
"/README.md:159-172": "Join HyperWriteAI Discord, Visit #self-operating-computer, Gpt4Vision Model, API Credits Required",
"/README.md:26-37": "Agent-1-Vision Model Overview",
"/README.md:37-67": "Improving Mouse Click Accuracy",
"/README.md:67-88": "Install, Configure, and Operate: A Comprehensive Guide",
"/README.md:89-124": "Installing SOCF and GMPV",
"/evaluate.py": "Vision Model Image Evaluation",
"/evaluate.py:1-31": "Setting Up Evaluation Test Cases",
"/evaluate.py:106-140": "Test Evaluation and Display",
"/evaluate.py:141-150": "Test Result Display",
"/evaluate.py:32-73": "ANSI Colors for Terminal Support Detection",
"/evaluate.py:75-105": "Evaluate Summary Screenshot: GPT-4 Vision Model Integration",
"/operate/actions.py": "AI-Powered Content Generation",
"/operate/actions.py:1-51": "Action Prediction Model",
"/operate/actions.py:116-153": "Grid Overlay Screenshot Capture",
"/operate/actions.py:154-189": "Screenshot-to-Message AI Model",
"/operate/actions.py:189-215": "Cursor-Guided AI Prompt Enhancement",
"/operate/actions.py:217-248": "GPT-4 Vision Prompt Creation",
"/operate/actions.py:249-275": "Encoding Image for AI Model Generation",
"/operate/actions.py:276-305": "Desktop Screenshot Labeling with GPT-4",
"/operate/actions.py:307-338": "Labeled Click and Decision Prompt System",
"/operate/actions.py:340-364": "API Click Position Calculator",
"/operate/actions.py:365-387": "Click Position Handler",
"/operate/actions.py:390-409": "Fetch OpenAI Chat Completion Asynchronously",
"/operate/actions.py:52-83": "Dynamic Model Caller with Screenshot Capture",
"/operate/actions.py:84-115": "Vision AI Message Encoder",
"/operate/dialog.py": "Error-Handling User Input in Dialog Operations",
"/operate/dialog.py:1-44": "Self-Operating Computer Response Model",
"/operate/dialog.py:110-139": "Exception Handling and Action Execution",
"/operate/dialog.py:140-171": "Action Type Check and Process",
"/operate/dialog.py:173-192": "Invalid Input Check and Error Message",
"/operate/dialog.py:46-80": "Voice Mode and WhisperMic Initialization",
"/operate/dialog.py:81-109": "Capturing and Processing Voice Inputs",
"/operate/exceptions.py": "ModelRecognitionException",
"/operate/main.py": "Main Entry Point for Self-Operating Computer",
"/operate/prompts.py": "Context-Based Prompts for AI-Assisted Google Tools",
"/operate/prompts.py:1-33": "Config Settings and Constants in Prompts Module",
"/operate/prompts.py:136-159": "AI-Assisted Web Interaction with Labeled Elements",
"/operate/prompts.py:161-183": "Contextual JSON Responses",
"/operate/prompts.py:185-217": "Prompt Formatting Functions",
"/operate/prompts.py:218-252": "Python Prompt Formatting Functions",
"/operate/prompts.py:33-63": "Interacting with Computers: Tips and Tricks",
"/operate/prompts.py:64-82": "Cursor Position Prompt",
"/operate/prompts.py:82-95": "Guessing Percentages: CLICK Refinement",
"/operate/prompts.py:97-135": "Interactive Prompts for Efficient Tasks",
"/operate/settings.py": "Environment Configurations in Settings.py",
"/operate/settings.py:1-36": "Configuration Manager for Settings",
"/operate/settings.py:37-39": "Set OpenAI API URL with Env Var or Current Value",
"/operate/utils/label.py": "Image Processing Utilities",
"/operate/utils/label.py:1-37": "Validate and Retrieve Image Data Functions",
"/operate/utils/label.py:102-128": "Timestamped Image Saving",
"/operate/utils/label.py:129-152": "Encode Labeled Image in Base64",
"/operate/utils/label.py:153-180": "Triple Backticks Remover & Click Percentage Calculator",
"/operate/utils/label.py:182-182": "Compute Label Percentages",
"/operate/utils/label.py:40-72": "Box Overlap Detection and Labeling Functionality",
"/operate/utils/label.py:74-101": "Bounding Box Labeler",
"/operate/utils/misc.py": "Multifunctional Data Processor",
"/operate/utils/misc.py:1-41": "Converting and Extracting: Misc.py Functions",
"/operate/utils/misc.py:43-74": "Parse JSON Response",
"/operate/utils/misc.py:75-97": "Response Parser and Classifier",
"/operate/utils/misc.py:98-102": "Handling Regex Exceptions in Search Data",
"/operate/utils/os.py": "Circular Motion and Text Input Utility",
"/operate/utils/os.py:1-44": "OS Utilities",
"/operate/utils/os.py:107-131": "Circular Movement Function and Assistant Message Retrieval",
"/operate/utils/os.py:46-85": "Automated OS Interaction Utility",
"/operate/utils/os.py:85-105": "Circular Cursor Clicker",
"/operate/utils/screenshot.py": "Screenshot Capture Utilities",
"/operate/utils/screenshot.py:1-39": "Grid Image Overlay",
"/operate/utils/screenshot.py:115-143": "Cross-Platform Screenshot Capture",
"/operate/utils/screenshot.py:144-178": "Cross-Platform Screenshot Capture Utility",
"/operate/utils/screenshot.py:179-182": "Cross-Platform Screenshot and Cursor Capture Utility",
"/operate/utils/screenshot.py:41-63": "Background Rectangle and Grid Lines Generator",
"/operate/utils/screenshot.py:64-92": "Grid Screenshot Labeler",
"/operate/utils/screenshot.py:93-114": "Screenshot Capture Utility",
"/operate/utils/style.py": "UI Style Configuration with PromptStyle",
"/operate/utils/style.py:1-34": "Dialog and UI Styles with PromptStyle",
"/operate/utils/style.py:35-36": "Detect Terminal Color Capabilities",
"/requirements-audio.txt": "Whisper Mic Requirements",
"/requirements.txt": "Python Packages for Project",
"/requirements.txt:1-50": "Python Package Dependencies List",
"/requirements.txt:51-52": "Project Libraries: aiohttp, ultralytics",
"/run.sh": "SOC Linux Install Script",
"/run.sh:1-48": "SOC Linux Installation Script",
"/run.sh:117-143": "OpenAI API Key Configurator",
"/run.sh:144-155": "MacOS Installation Check",
"/run.sh:49-71": "Universal Software Installer",
"/run.sh:72-115": "Automating Python Project Setup",
"/setup.py": "Setting up 'self-operating-computer'"
}
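docs/data/titles/0.json flattens the same information into a plain path-to-title map, which is the natural shape for keyword search over the generated docs. A minimal lookup sketch assuming only the layout visible in this diff; the case-insensitive substring scoring is an assumption:

import json

def search_titles(index_path, query):
    # index_path points at a flat {"<path or path:lines>": "<title>"} map
    # like docs/data/titles/0.json above.
    with open(index_path, encoding="utf-8") as f:
        titles = json.load(f)
    q = query.lower()
    return [(path, title) for path, title in titles.items() if q in title.lower()]

# e.g. search_titles("docs/data/titles/0.json", "screenshot") would return the
# /operate/utils/screenshot.py chunks listed above.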
30 changes: 30 additions & 0 deletions docs/doc/347970bb-ed24-4242-ae37-7fe0302c6efb.json
@@ -0,0 +1,30 @@
{
"summary": "This Bash script installs SOC on Linux, requires various packages, checks OS for software installation, handles errors, and configures .env file with OpenAI API key while prompting user input and managing permissions on Mac.",
"details": [
{
"comment": "The code is a Bash script for installing the Self-Operating-Computer (SOC) on a Linux system. It starts by clearing the terminal and displaying a welcome message, then defines functions to log errors, check if commands exist, and install packages based on the operating system. The script requires bash, curl/wget, python3, pip, and git.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":0-47",
"content": "#!/bin/bash\n#\n# SOC Installer Script v0.0.1\n# GitHub: https://github.com/OthersideAI/self-operating-computer\n# Issues: https://github.com/OthersideAI/self-operating-computer/issues\n# Requires: bash, curl/wget, python3, pip, git\n#\n# Please open an issue if you notice any bugs.\n#\n#\n# This script is create by centopw\n#\n#\nclear\necho -e \"\\e[0m\\c\"\nLOG_FILE=\"install_log.txt\"\n# shellcheck disable=SC2016\necho '\n $$$$$$\\ $$$$$$\\ $$$$$$\\ \n$$ __$$\\ $$ __$$\\ $$ __$$\\ \n$$ / \\__|$$ / $$ |$$ / \\__|\n\\$$$$$$\\ $$ | $$ |$$ | \n \\____$$\\ $$ | $$ |$$ | \n$$\\ $$ |$$ | $$ |$$ | $$\\ \n\\$$$$$$ | $$$$$$ |\\$$$$$$ |\n \\______/ \\______/ \\______/ \n Self-Operating-Computer\n--- Created by OthersideAI ---\n'\n# Function to log errors\nlog_error() {\n echo \"Error at $(date): $1\" >> \"$LOG_FILE\"\n}\n# Function to check if a command exists\ncommand_exists() {\n command -v \"$1\" &> /dev/null\n}\n# Function to install packages based on the operating system\ninstall_packages() {\n if [ \"$os\" == \"Linux\" ]; then\n # Use the appropriate package manager for Linux"
},
{
"comment": "This code checks the operating system and package manager to install a specified software. If the required package manager is found, it installs the software using sudo commands. If not, it logs an error and exits. For macOS, it uses Homebrew if installed; otherwise, it logs an error and exits. For Windows (MINGW64_NT-10.0), it uses Chocolatey if installed; otherwise, it logs an error and exits.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":48-70",
"content": " if command_exists apt-get; then\n sudo apt-get install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n elif command_exists yum; then\n sudo yum install -y \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Unsupported package manager. Please install $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"Darwin\" ]; then\n # Use Homebrew for macOS\n if command_exists brew; then\n brew install \"$1\" || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Homebrew not found. Please install Homebrew and then $1 manually.\"\n exit 1\n fi\n elif [ \"$os\" == \"MINGW64_NT-10.0\" ]; then\n # Use Chocolatey for Windows\n if command_exists choco; then\n choco install \"$1\" -y || { log_error \"Unable to install $1.\"; exit 1; }\n else\n log_error \"Chocolatey not found. Please install Chocolatey and then $1 manually.\"\n exit 1"
},
{
"comment": "This code checks the operating system and ensures Python, pip, and Git are installed. It creates a Python virtual environment and installs project requirements before installing the project itself.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":71-114",
"content": " fi\n else\n log_error \"Unsupported operating system. Please install $1 manually.\"\n exit 1\n fi\n}\n# Function to run a script and log errors\nrun_script() {\n eval \"$1\" || { log_error \"Error running $1.\"; exit 1; }\n}\n# Check the operating system\nos=$(uname -s)\n# Check if Python is installed\nif ! command_exists python3; then\n echo \"Python not found. Installing Python...\"\n install_packages python3\nfi\n# Check if pip is installed\nif ! command_exists pip; then\n echo \"pip not found. Installing pip...\"\n install_packages python3-pip\nfi\n# Check if git is installed\nif ! command_exists git; then\n echo \"Git not found. Installing Git...\"\n install_packages git\nfi \n# Create a Python virtual environment\nrun_script \"python3 -m venv venv\"\n# Activate the virtual environment\nsource venv/bin/activate || { log_error \"Unable to activate the virtual environment.\"; exit 1; }\n# Install project requirements\nrun_script \"pip install -r requirements.txt\"\n# Install Project and Command-Line Interface\nrun_script \"pip install .\""
},
{
"comment": "This code checks if the .env file exists and if it contains an OPENAI_API_KEY. If not, it prompts the user to enter their OpenAI API key, stores it in a new .env file as an environment variable, and then informs the user about the final step of granting permissions for 'Screen Recording' and 'Accessibility' in Mac's System Preferences.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":116-142",
"content": "# Check if the .env file exists and the OPENAI_API_KEY is set in it\nif [ -f .env ] && grep -q \"OPENAI_API_KEY\" .env; then\n echo \"OpenAI API key found in .env file. Skipping prompt...\"\nelse\n # Prompt user for Open AI key\n read -p \"Enter your OpenAI API key: \" openai_key\n # Set the API key as an environment variable\n export OPENAI_API_KEY=\"$openai_key\"\n # Create a new .env file\n touch .env\n # Write the API key to the .env file\n echo \"OPENAI_API_KEY='$openai_key'\" > .env\nfi\n# Notify the user about the last step\necho \"Final Step: As a last step, the Terminal app will ask for permission for 'Screen Recording' and 'Accessibility' in the 'Security & Privacy' page of Mac's 'System Preferences.'\"\necho \"Operating system: $os\"\nif [ \"$os\" == \"Darwin\" ]; then\n echo \"Attempting to open Security & Privacy settings...\"\n open /System/Library/PreferencePanes/Security.prefPane\n read -p \"Have you granted the necessary permissions in the Security & Privacy settings? (y/n): \" confirm\n if [ \"$confirm\" != \"y\" ]; then"
},
{
"comment": "The code checks if the system is macOS. If it's not, it skips some steps and informs that it's not a macOS system. If permissions are granted, it proceeds to install the framework and runs it with \"operate\" script.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/run.sh\":143-154",
"content": " echo \"Please grant the necessary permissions and then rerun the script.\"\n exit 1\n fi\nelse\n echo \"Not a macOS system, skipping...\"\nfi\n# End of the script\necho \"Installation complete. Enjoy using the Self-Operating Computer Framework!\"\n# Run the framework\nrun_script \"operate\""
}
]
}
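The last two chunks describe how run.sh writes OPENAI_API_KEY into a .env file. For reference, the Python side of that handshake mirrors the pattern visible in evaluate.py further down in this diff (load_dotenv plus os.getenv); the sketch below assumes that pattern rather than quoting run.sh itself:

import os
import openai
from dotenv import load_dotenv

load_dotenv()  # reads OPENAI_API_KEY='...' from the .env created by run.sh
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise SystemExit("OPENAI_API_KEY not set; rerun run.sh or edit .env manually")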
10 changes: 10 additions & 0 deletions docs/doc/3a3bb70b-8380-48f2-913e-4618828f90cd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"summary": "This code likely refers to a specific type of microphone called \"whisper-mic,\" which is designed for capturing quiet or whispered audio.",
"details": [
{
"comment": "This code likely refers to a specific type of microphone called \"whisper-mic,\" which is designed for capturing quiet or whispered audio.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements-audio.txt\":0-0",
"content": "whisper-mic"
}
]
}
40 changes: 40 additions & 0 deletions docs/doc/4866fd28-ffa6-4bbe-b91f-4767e302f2f1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"summary": "The code has functions to add grids to images and capture screenshots using PIL, accepting input in various formats. It saves the captured image at a specified file path or displays an error message for unsupported platforms.",
"details": [
{
"comment": "The code imports necessary libraries and defines a function to add a grid to an image. It loads the original image, creates a drawing object, gets the image size, and reduces the font size for the grid.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":0-38",
"content": "import os\nimport platform\nimport subprocess\nimport pyautogui\nfrom PIL import Image, ImageDraw, ImageGrab\nimport Xlib.display\nimport Xlib.X\nimport Xlib.Xutil # not sure if Xutil is necessary\nfrom operate.settings import Config\nfrom operate.prompts import ACCURATE_PIXEL_COUNT\n# Load configuration\nconfig = Config()\nmonitor_size = config.monitor_size\ndef add_grid_to_image(original_image_path, new_image_path, grid_interval):\n \"\"\"\n Add a grid to an image.\n Args:\n original_image_path (str): The file path of the original image.\n new_image_path (str): The file path to save the new image with the grid.\n grid_interval (int): The interval between grid lines in pixels.\n Returns:\n None: The function saves the new image with the grid at the specified path.\n \"\"\"\n # Load the image\n image = Image.open(original_image_path)\n # Create a drawing object\n draw = ImageDraw.Draw(image)\n # Get the image size\n width, height = image.size\n # Reduce the font size a bit\n font_size = int(grid_interval / 10) # Reduced font size"
},
{
"comment": "This function creates a background rectangle for text and draws it with white fill. It also draws vertical lines and labels at every `grid_interval` pixels.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":40-62",
"content": " # Calculate the background size based on the font size\n bg_width = int(font_size * 4.2) # Adjust as necessary\n bg_height = int(font_size * 1.2) # Adjust as necessary\n # Function to draw text with a white rectangle background\n def draw_label_with_background(\n position, text, draw, font_size, bg_width, bg_height\n ):\n # Adjust the position based on the background size\n text_position = (position[0] + bg_width // 2, position[1] + bg_height // 2)\n # Draw the text background\n draw.rectangle(\n [position[0], position[1], position[0] + bg_width, position[1] + bg_height],\n fill=\"white\",\n )\n # Draw the text\n draw.text(text_position, text, fill=\"black\", font_size=font_size, anchor=\"mm\")\n # Draw vertical lines and labels at every `grid_interval` pixels\n for x in range(grid_interval, width, grid_interval):\n line = ((x, 0), (x, height))\n draw.line(line, fill=\"blue\")\n for y in range(grid_interval, height, grid_interval):"
},
{
"comment": "Calculates the percentage of coordinates and draws labels with background. Draws horizontal lines for grid labels. Saves the image with the grid at specified file path.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":63-91",
"content": " # Calculate the percentage of the width and height\n x_percent = round((x / width) * 100)\n y_percent = round((y / height) * 100)\n draw_label_with_background(\n (x - bg_width // 2, y - bg_height // 2),\n f\"{x_percent}%,{y_percent}%\",\n draw,\n font_size,\n bg_width,\n bg_height,\n )\n # Draw horizontal lines - labels are already added with vertical lines\n for y in range(grid_interval, height, grid_interval):\n line = ((0, y), (width, y))\n draw.line(line, fill=\"blue\")\n # Save the image with the grid\n image.save(new_image_path)\ndef capture_mini_screenshot_with_cursor(\n file_path=os.path.join(\"screenshots\", \"screenshot_mini.png\"), x=0, y=0\n):\n \"\"\"\n Capture a mini screenshot with the cursor at the specified coordinates.\n Args:\n file_path (str, optional): The file path to save the screenshot. Defaults to \"screenshots/screenshot_mini.png\"."
},
{
"comment": "This code is used to take a screenshot of a specific area on the user's monitor using the Python Imaging Library (PIL) and ImageGrab modules. It takes optional x and y coordinates as inputs, which can be specified as integers or percentage strings. The function converts the input values into the appropriate format for calculating the coordinates of the rectangle to capture the screenshot. If the user is on a Linux system, it performs additional calculations to convert percentage-based input into actual pixel coordinates and upscales the image for better visibility.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":92-113",
"content": " x (int or str, optional): The x-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n y (int or str, optional): The y-coordinate of the cursor position. Can be specified as an integer or a percentage string. Defaults to 0.\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Linux\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n # Define the coordinates for the rectangle\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n x2, y2 = int(x + ACCURATE_PIXEL_COUNT / 2), int(y + ACCURATE_PIXEL_COUNT / 2)\n screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))\n screenshot = screenshot.resize(\n (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS\n ) # upscale the image so it's easier to see and percentage marks more visible"
},
{
"comment": "Code is capturing a screenshot based on user platform. For non-Darwin platforms, it saves the screenshot, while for Darwin (macOS), it uses screencapture utility to capture the screen with cursor and saves the result. Both versions save the grid screenshot as well.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":114-142",
"content": " screenshot.save(file_path)\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join(\n screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\n elif user_platform == \"Darwin\":\n x = float(x[:-1]) # convert x from \"50%\" to 50.\n y = float(y[:-1])\n x = (x / 100) * monitor_size[\n \"width\"\n ] # convert x from 50 to 0.5 * monitor_width\n y = (y / 100) * monitor_size[\"height\"]\n x1, y1 = int(x - ACCURATE_PIXEL_COUNT / 2), int(y - ACCURATE_PIXEL_COUNT / 2)\n width = ACCURATE_PIXEL_COUNT\n height = ACCURATE_PIXEL_COUNT\n # Use the screencapture utility to capture the screen with the cursor\n rect = f\"-R{x1},{y1},{width},{height}\"\n subprocess.run([\"screencapture\", \"-C\", rect, file_path])\n screenshots_dir = \"screenshots\"\n grid_screenshot_filename = os.path.join("
},
{
"comment": "This code captures a screenshot of the computer's display with cursor and saves it to the specified file path. It checks the user platform (Windows, Linux, or Mac OS) and uses appropriate libraries to capture the screenshot.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":143-177",
"content": " screenshots_dir, \"screenshot_mini_with_grid.png\"\n )\n add_grid_to_image(\n file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)\n )\ndef capture_screen_with_cursor(file_path):\n \"\"\"\n Capture the screen with the cursor and save it to the specified file path.\n Args:\n file_path (str): The file path where the screenshot will be saved.\n Raises:\n None\n Returns:\n None\n \"\"\"\n user_platform = platform.system()\n if user_platform == \"Windows\":\n screenshot = pyautogui.screenshot()\n screenshot.save(file_path)\n elif user_platform == \"Linux\":\n # Use xlib to prevent scrot dependency for Linux\n screen = Xlib.display.Display().screen()\n size = screen.width_in_pixels, screen.height_in_pixels\n monitor_size[\"width\"] = size[0]\n monitor_size[\"height\"] = size[1]\n screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))\n screenshot.save(file_path)\n elif user_platform == \"Darwin\": # (Mac OS)"
},
{
"comment": "This code captures a screenshot of the computer screen with the cursor, or prints an error message if the platform is not supported.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/screenshot.py\":178-181",
"content": " # Use the screencapture utility to capture the screen with the cursor\n subprocess.run([\"screencapture\", \"-C\", file_path])\n else:\n print(f\"The platform you're using ({user_platform}) is not currently supported\")"
}
]
}
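The chunks above document add_grid_to_image(original_image_path, new_image_path, grid_interval) and capture_screen_with_cursor(file_path). A hedged usage sketch based only on those signatures; the output paths and the 100-pixel grid interval are illustrative:

import os
from operate.utils.screenshot import add_grid_to_image, capture_screen_with_cursor

os.makedirs("screenshots", exist_ok=True)
raw_path = os.path.join("screenshots", "screenshot.png")
grid_path = os.path.join("screenshots", "screenshot_with_grid.png")

capture_screen_with_cursor(raw_path)         # pyautogui / Xlib / screencapture per OS
add_grid_to_image(raw_path, grid_path, 100)  # blue grid lines with percentage labels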
10 changes: 10 additions & 0 deletions docs/doc/4bf7650c-3fea-4149-bd10-682a244a9f3c.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"summary": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. The main function is then called with these arguments.",
"details": [
{
"comment": "This code defines the main entry point of the Self-Operating Computer, allowing the user to specify a model and input mode. It uses the argparse module to define command line arguments for the model, voice input mode, and prompt. The main function is then called with these arguments.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/main.py\":0-46",
"content": "\"\"\"\nSelf-Operating Computer\n\"\"\"\nimport argparse\nfrom operate.utils.style import ANSI_BRIGHT_MAGENTA\nfrom operate.dialog import main\ndef main_entry():\n parser = argparse.ArgumentParser(\n description=\"Run the self-operating-computer with a specified model.\"\n )\n parser.add_argument(\n \"-m\",\n \"--model\",\n help=\"Specify the model to use\",\n required=False,\n default=\"gpt-4\",\n )\n # Add a voice flag\n parser.add_argument(\n \"--voice\",\n help=\"Use voice input mode\",\n action=\"store_true\",\n )\n # Allow for direct input of prompt\n parser.add_argument(\n \"--prompt\",\n help=\"Directly input the objective prompt\",\n type=str,\n required=False,\n )\n try:\n args = parser.parse_args()\n main(\n args.model,\n terminal_prompt=args.prompt,\n voice_mode=args.voice,\n )\n except KeyboardInterrupt:\n print(f\"\\n{ANSI_BRIGHT_MAGENTA}Exiting...\")\nif __name__ == \"__main__\":\n main_entry()"
}
]
}
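Since main_entry simply forwards the parsed flags to operate.dialog.main(model, terminal_prompt=..., voice_mode=...), the CLI call operate -m gpt-4 --prompt "Go to Github.com" is equivalent to the following Python sketch (the prompt text is just an example objective):

from operate.dialog import main

main(
    "gpt-4",                             # the argparse default shown above
    terminal_prompt="Go to Github.com",  # example objective
    voice_mode=False,
)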
30 changes: 30 additions & 0 deletions docs/doc/4e8d04d1-1460-493f-b7c8-e6b09fc01cca.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"summary": "The code uses GPT-4 Vision model to evaluate image adherence to guidelines, displays results with color-coded messages after setting up test cases and formatting prompts. It also checks the result of an objective, prints outcome (PASS or FAIL) along with passed/failed tests count, and resets colors for readability.",
"details": [
{
"comment": "The code is importing necessary libraries and defining constants for the evaluation process. It appears to be setting up a test case dictionary and a function to determine if a given guideline is met in an image based on a screenshot.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":0-30",
"content": "import sys\nimport os\nimport subprocess\nimport platform\nimport base64\nimport json\nimport openai\nfrom dotenv import load_dotenv\n# \"Objective for `operate`\" : \"Guideline for passing this test case given to GPT-4v\"\nTEST_CASES = {\n \"Go to Github.com\": \"The Github home page is visible.\",\n \"Go to Youtube.com and play a video\": \"The YouTube video player is visible.\",\n}\nEVALUATION_PROMPT = \"\"\"\nYour job is to look at the given screenshot and determine if the following guideline is met in the image.\nYou must respond in the following format ONLY. Do not add anything else:\n{{ \"guideline_met\": (true|false), \"reason\": \"Explanation for why guideline was or wasn't met\" }}\nguideline_met must be set to a JSON boolean. True if the image meets the given guideline.\nreason must be a string containing a justification for your decision.\nGuideline: {guideline}\n\"\"\"\nSUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\""
},
{
"comment": "This code checks if the terminal supports ANSI escape codes and sets corresponding colors based on the platform. If supported, it defines various colored text variables. Otherwise, it sets them to empty strings. The code also includes functions for formatting an evaluation prompt and parsing evaluation content.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":31-72",
"content": " Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\nif supports_ansi():\n # Standard green text\n ANSI_GREEN = \"\\033[32m\"\n # Bright/bold green text\n ANSI_BRIGHT_GREEN = \"\\033[92m\"\n # Reset to default text color\n ANSI_RESET = \"\\033[0m\"\n # ANSI escape code for blue text\n ANSI_BLUE = \"\\033[94m\" # This is for bright blue\n # Standard yellow text\n ANSI_YELLOW = \"\\033[33m\"\n ANSI_RED = \"\\033[31m\"\n # Bright magenta text\n ANSI_BRIGHT_MAGENTA = \"\\033[95m\"\nelse:\n ANSI_GREEN = \"\"\n ANSI_BRIGHT_GREEN = \"\"\n ANSI_RESET = \"\"\n ANSI_BLUE = \"\"\n ANSI_YELLOW = \"\"\n ANSI_RED = \"\"\n ANSI_BRIGHT_MAGENTA = \"\"\ndef format_evaluation_prompt(guideline):\n prompt = EVALUATION_PROMPT.format(guideline=guideline)\n return prompt\ndef parse_eval_content(content):\n try:\n res = json.loads(content)"
},
{
"comment": "Code function: evaluate_summary_screenshot\nPurpose: Evaluate if the summary screenshot meets a given guideline\nActions: \n1. Loads the summary screenshot\n2. Encodes it in base64 format\n3. Creates an evaluation message with text and image\n4. Sends the message to OpenAI's GPT-4 Vision model for evaluation",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":74-104",
"content": " print(res[\"reason\"])\n return res[\"guideline_met\"]\n except:\n print(\"The model gave a bad evaluation response and it couldn't be parsed. Exiting...\")\n exit(1)\ndef evaluate_summary_screenshot(guideline):\n '''Load the summary screenshot and return True or False if it meets the given guideline.'''\n with open(SUMMARY_SCREENSHOT_PATH, \"rb\") as img_file:\n img_base64 = base64.b64encode(img_file.read()).decode(\"utf-8\")\n eval_message = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"text\", \"text\": format_evaluation_prompt(guideline)},\n {\n \"type\": \"image_url\",\n \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n },\n ],\n }]\n response = openai.chat.completions.create(\n model=\"gpt-4-vision-preview\",\n messages=eval_message,\n presence_penalty=1,\n frequency_penalty=1,\n temperature=0.7,\n max_tokens=300,"
},
{
"comment": "The code evaluates whether a test case meets its given guideline. It runs the \"operate\" function with the test case prompt and then calls the \"evaluate_summary_screenshot\" function to compare the result against the guideline. If the operation is successful, it prints a success message; otherwise, it prints an error message. The code loops through all the TEST_CASES, counts the number of passed and failed tests, and finally displays the results in color-coded messages.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":105-139",
"content": " )\n eval_content = response.choices[0].message.content\n return parse_eval_content(eval_content)\ndef run_test_case(objective, guideline):\n '''Returns True if the result of the test with the given prompt meets the given guideline.'''\n # Run `operate` with the test case prompt\n subprocess.run(['operate', '--prompt', f'\"{objective}\"'], stdout=subprocess.DEVNULL)\n try:\n result = evaluate_summary_screenshot(guideline)\n except(OSError):\n print(\"Couldn't open the summary screenshot\")\n return False\n return result\ndef main():\n load_dotenv()\n openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n print(f\"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}\")\n passed = 0; failed = 0\n for objective, guideline in TEST_CASES.items():\n print(f\"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'\")\n result = run_test_case(objective, guideline)\n if result:\n print(f\"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'\")\n passed += 1"
},
{
"comment": "The code snippet checks the result of an objective and prints the outcome (PASS or FAIL) along with the count of passed and failed tests. It resets colors for readability.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/evaluate.py\":140-149",
"content": " else:\n print(f\"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'\")\n failed += 1\n print(\n f\"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed\"\n )\nif __name__ == \"__main__\":\n main()"
}
]
}
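The evaluation prompt above requires GPT-4V to answer with a single JSON object containing guideline_met and reason, which parse_eval_content then loads. A small sketch of that contract; the sample model reply is made up for illustration:

import json

sample_reply = '{"guideline_met": true, "reason": "The Github home page is visible."}'

def check_reply(content):
    # Same parsing as parse_eval_content in the diff: load the JSON, print the
    # justification, and return the boolean verdict.
    res = json.loads(content)
    print(res["reason"])
    return res["guideline_met"]

assert check_reply(sample_reply) is True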
35 changes: 35 additions & 0 deletions docs/doc/52984b9f-96af-4dbe-bc63-7aff1b5d9c1f.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"summary": "Both comments discuss code that handles user input and executes corresponding actions, with Comment A focusing on a Self-Operating Computer setup and error handling, while Comment B focuses on input parameter checks for dialog operations.",
"details": [
{
"comment": "This code appears to be part of a Self-Operating Computer, which uses a model for generating responses. The main function takes in the model, terminal prompt, and voice mode as parameters. It initializes `WhisperMic` if voice mode is enabled.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":0-43",
"content": "import sys\nimport os\nimport platform\nimport asyncio\nfrom prompt_toolkit.shortcuts import message_dialog\nfrom prompt_toolkit import prompt\nfrom operate.exceptions import ModelNotRecognizedException\nfrom operate.prompts import USER_QUESTION\nfrom operate.settings import Config\nfrom operate.utils.style import (\n ANSI_GREEN,\n ANSI_RESET,\n ANSI_BLUE,\n ANSI_YELLOW,\n ANSI_RED,\n ANSI_BRIGHT_MAGENTA,\n style,\n)\nfrom operate.utils.os import (\n keyboard_type,\n search,\n click,\n)\nfrom operate.actions import get_next_action, summarize\nfrom operate.utils.misc import parse_response\n# Load configuration\nconfig = Config()\ndef main(model, terminal_prompt, voice_mode=False):\n \"\"\"\n Main function for the Self-Operating Computer.\n Parameters:\n - model: The model used for generating responses.\n - terminal_prompt: A string representing the prompt provided in the terminal.\n - voice_mode: A boolean indicating whether to enable voice mode.\n Returns:\n None\n \"\"\"\n mic = None\n # Initialize `WhisperMic`, if `voice_mode` is True"
},
{
"comment": "Checks if voice mode is enabled, then tries to import and initialize the WhisperMic module. If the module is missing, it prints an error message and exits. Displays a message dialog unless the prompt was given directly via terminal. Skips objective prompt if provided as an argument or prompts for input through the WhisperMic in voice mode. Clears the console on all operating systems except Windows where it uses \"cls\" command.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":45-79",
"content": " validation(model, voice_mode)\n if voice_mode:\n try:\n from whisper_mic import WhisperMic\n # Initialize WhisperMic if import is successful\n mic = WhisperMic()\n except ImportError:\n print(\n \"Voice mode requires the 'whisper_mic' module. Please install it using 'pip install -r requirements-audio.txt'\"\n )\n sys.exit(1)\n # Skip message dialog if prompt was given directly\n if not terminal_prompt:\n message_dialog(\n title=\"Self-Operating Computer\",\n text=\"Ask a computer to do anything.\",\n style=style,\n ).run()\n else:\n print(\"Running direct prompt...\")\n print(\"SYSTEM\", platform.system())\n # Clear the console\n if platform.system() == \"Windows\":\n os.system(\"cls\")\n else:\n print(\"\\033c\", end=\"\")\n if terminal_prompt: # Skip objective prompt if it was given as an argument\n objective = terminal_prompt\n elif voice_mode:\n print("
},
{
"comment": "The code is capturing voice input from the microphone and storing it in the \"objective\" variable. If an error occurs while capturing voice input, it will print an error message and exit. Otherwise, it prints a message from the self-operating computer and the user's question, then stores the objective as the user's message content. It then enters a loop where it waits for the next action by calling a function \"get_next_action\" with the current messages and objective. If an error occurs while waiting for the next action, it will print an error message.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":80-108",
"content": " f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)\"\n )\n try:\n objective = mic.listen()\n except Exception as e:\n print(f\"{ANSI_RED}Error in capturing voice input: {e}{ANSI_RESET}\")\n return # Exit if voice input fails\n else:\n print(f\"{ANSI_GREEN}[Self-Operating Computer]\\n{ANSI_RESET}{USER_QUESTION}\")\n print(f\"{ANSI_YELLOW}[User]{ANSI_RESET}\")\n objective = prompt(style=style)\n assistant_message = {\"role\": \"assistant\", \"content\": USER_QUESTION}\n user_message = {\n \"role\": \"user\",\n \"content\": f\"Objective: {objective}\",\n }\n messages = [assistant_message, user_message]\n loop_count = 0\n while True:\n if config.debug:\n print(\"[loop] messages before next action:\\n\\n\\n\", messages[1:])\n try:\n response = asyncio.run(get_next_action(model, messages, objective))\n action = parse_response(response)\n action_type = action.get(\"type\")"
},
{
"comment": "The code is handling exceptions for a ModelNotRecognizedException and any other exception that occurs during the execution. It then checks if the action_type is \"DONE\", if so, it prints a completion message, summarizes the model, and exits. If the action_type is not unknown, it prints an act message along with the action type and detail, and initializes an empty function_response variable if the action type is \"SEARCH\".",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":109-138",
"content": " action_detail = action.get(\"data\")\n except ModelNotRecognizedException as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n except Exception as e:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}\"\n )\n break\n if action_type == \"DONE\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}\"\n )\n summary = summarize(model, messages, objective)\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\\n{ANSI_RESET}{summary}\"\n )\n break\n if action_type != \"UNKNOWN\":\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} {ANSI_RESET}{action_detail}\"\n )\n function_response = \"\"\n if action_type == \"SEARCH\":"
},
{
"comment": "This code block checks the action type and performs the corresponding action. If the action type is not recognized, it prints an error message and breaks the loop. It also logs the act completion and updates the messages list for further processing.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":139-170",
"content": " function_response = search(action_detail)\n elif action_type == \"TYPE\":\n function_response = keyboard_type(action_detail)\n elif action_type == \"CLICK\":\n function_response = click(action_detail)\n else:\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] something went wrong :({ANSI_RESET}\"\n )\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\\n{ANSI_RESET}{response}\"\n )\n break\n print(\n f\"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}\"\n )\n message = {\n \"role\": \"assistant\",\n \"content\": function_response,\n }\n messages.append(message)\n loop_count += 1\n if loop_count > 15:\n break\ndef validation(model, voice_mode):\n \"\"\"\n Validate the input parameters for the dialog operation."
},
{
"comment": "This code checks the input parameters for dialog operation and raises SystemExit if the input parameters are invalid. It also prints a message indicating which API key is missing based on the chosen model.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/dialog.py\":172-191",
"content": " Args:\n model (str): The model to be used for the dialog operation.\n voice_mode (bool): Flag indicating whether to use voice mode.\n Raises:\n SystemExit: If the input parameters are invalid.\n \"\"\"\n if voice_mode and not config.openai_api_key:\n print(\"To use voice mode, please add an OpenAI API key\")\n sys.exit(1)\n if model == \"gpt-4-vision-preview\" and not config.openai_api_key:\n print(\"To use `gpt-4-vision-preview` add an OpenAI API key\")\n sys.exit(1)\n if model == \"gemini-pro-vision\" and not config.google_api_key:\n print(\"To use `gemini-pro-vision` add a Google API key\")\n sys.exit(1)"
}
]
}
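The dialog loop described above dispatches on the parsed action type (SEARCH, TYPE, CLICK, DONE) to helpers imported from operate.utils.os. A condensed sketch of that dispatch; the example action dict is illustrative of what parse_response returns, not copied from the PR:

from operate.utils.os import click, keyboard_type, search

def dispatch(action):
    action_type = action.get("type")
    detail = action.get("data")
    if action_type == "SEARCH":
        return search(detail)
    if action_type == "TYPE":
        return keyboard_type(detail)
    if action_type == "CLICK":
        return click(detail)
    raise ValueError(f"Unrecognized action type: {action_type}")

# e.g. dispatch({"type": "TYPE", "data": "hello world"})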
15 changes: 15 additions & 0 deletions docs/doc/6d712400-8eb0-4525-9321-2f4dafe97412.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"summary": "The project requires Python packages aiohttp 3.9.1 and ultralytics 8.0.227, listed in the requirements.txt format.",
"details": [
{
"comment": "This is a list of Python package dependencies for a project, specified in requirements.txt format.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements.txt\":0-49",
"content": "annotated-types==0.6.0\nanyio==3.7.1\ncertifi==2023.7.22\ncharset-normalizer==3.3.2\ncolorama==0.4.6\ncontourpy==1.2.0\ncycler==0.12.1\ndistro==1.8.0\nEasyProcess==1.1\nentrypoint2==1.1\nexceptiongroup==1.1.3\nfonttools==4.44.0\nh11==0.14.0\nhttpcore==1.0.2\nhttpx==0.25.1\nidna==3.4\nimportlib-resources==6.1.1\nkiwisolver==1.4.5\nmatplotlib==3.8.1\nMouseInfo==0.1.3\nmss==9.0.1\nnumpy==1.26.1\nopenai==1.2.3\npackaging==23.2\nPillow==10.1.0\nprompt-toolkit==3.0.39\nPyAutoGUI==0.9.54\npydantic==2.4.2\npydantic_core==2.10.1\nPyGetWindow==0.0.9\nPyMsgBox==1.0.9\npyparsing==3.1.1\npyperclip==1.8.2\nPyRect==0.2.0\npyscreenshot==3.1\nPyScreeze==0.1.29\npython3-xlib==0.15\npython-dateutil==2.8.2\npython-dotenv==1.0.0\npytweening==1.0.7\nrequests==2.31.0\nrubicon-objc==0.4.7\nsix==1.16.0\nsniffio==1.3.0\ntqdm==4.66.1\ntyping_extensions==4.8.0\nurllib3==2.0.7\nwcwidth==0.2.9\nzipp==3.17.0\ngoogle-generativeai==0.3.0"
},
{
"comment": "These lines specify the required Python libraries for the project: aiohttp 3.9.1 and ultralytics 8.0.227.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/requirements.txt\":50-51",
"content": "aiohttp==3.9.1\nultralytics==8.0.227"
}
]
}
40 changes: 40 additions & 0 deletions docs/doc/7b70d3c1-0ec1-4e5e-b03c-32b9fa9253b5.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"summary": "The code includes functions for handling image data, such as drawing bounding boxes and validating overlaps, as well as encoding tasks like converting images to base64 and formatting message content by removing triple backticks and calculating click positions.",
"details": [
{
"comment": "The code defines two functions:\n1. `validate_and_extract_image_data`: Validates the given data and extracts image URL if the request is valid.\n2. `get_label_coordinates`: Retrieves the coordinates for a given label from a dictionary of labels and their coordinates.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":0-36",
"content": "import io\nimport base64\nimport json\nimport os\nimport time\nimport asyncio\nfrom PIL import Image, ImageDraw\ndef validate_and_extract_image_data(data):\n if not data or \"messages\" not in data:\n raise ValueError(\"Invalid request, no messages found\")\n messages = data[\"messages\"]\n if (\n not messages\n or not isinstance(messages, list)\n or not messages[-1].get(\"image_url\")\n ):\n raise ValueError(\"No image provided or incorrect format\")\n image_data = messages[-1][\"image_url\"][\"url\"]\n if not image_data.startswith(\"data:image\"):\n raise ValueError(\"Invalid image format\")\n return image_data.split(\"base64,\")[-1], messages\ndef get_label_coordinates(label, label_coordinates):\n \"\"\"\n Retrieves the coordinates for a given label.\n :param label: The label to find coordinates for (e.g., \"~1\").\n :param label_coordinates: Dictionary containing labels and their coordinates.\n :return: Coordinates of the label or None if the label is not found.\n \"\"\"\n return label_coordinates.get(label)"
},
{
"comment": "The function `is_overlapping` checks if two boxes overlap by comparing their coordinates. If there is no overlap, the function returns False; otherwise, it returns True.\n\nThe `add_labels` function decodes base64 data into image bytes and opens it as an image using PIL. It creates copies of the original image and a debug image. The YOLO model applies object detection on the image. The code then draws on the images using the ImageDraw module, and stores label coordinates in a dictionary named `label_coordinates`.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":39-71",
"content": "def is_overlapping(box1, box2):\n x1_box1, y1_box1, x2_box1, y2_box1 = box1\n x1_box2, y1_box2, x2_box2, y2_box2 = box2\n # Check if there is no overlap\n if x1_box1 > x2_box2 or x1_box2 > x2_box1:\n return False\n if (\n y1_box1 > y2_box2 or y1_box2 > y2_box1\n ): # Adjusted to check 100px proximity above\n return False\n return True\ndef add_labels(base64_data, yolo_model):\n image_bytes = base64.b64decode(base64_data)\n image_labeled = Image.open(io.BytesIO(image_bytes)) # Corrected this line\n image_debug = image_labeled.copy() # Create a copy for the debug image\n image_original = (\n image_labeled.copy()\n ) # Copy of the original image for base64 return\n results = yolo_model(image_labeled)\n draw = ImageDraw.Draw(image_labeled)\n debug_draw = ImageDraw.Draw(\n image_debug\n ) # Create a separate draw object for the debug image\n font_size = 45\n detections_dir = \"detections\"\n label_coordinates = {} # Dictionary to store coordinates"
},
{
"comment": "Creates a directory for detections if it doesn't exist. Loops through the results, drawing bounding boxes and labels on images. Avoids redrawing over existing boxes by checking overlaps before redrawing as red boxes.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":73-100",
"content": " if not os.path.exists(detections_dir):\n os.makedirs(detections_dir)\n counter = 0\n drawn_boxes = [] # List to keep track of boxes already drawn\n for result in results:\n if hasattr(result, \"boxes\"):\n for det in result.boxes:\n bbox = det.xyxy[0]\n x1, y1, x2, y2 = bbox.tolist()\n debug_label = \"D_\" + str(counter)\n debug_index_position = (x1, y1 - font_size)\n debug_draw.rectangle([(x1, y1), (x2, y2)], outline=\"blue\", width=1)\n debug_draw.text(\n debug_index_position,\n debug_label,\n fill=\"blue\",\n font_size=font_size,\n )\n overlap = any(\n is_overlapping((x1, y1, x2, y2), box) for box in drawn_boxes\n )\n if not overlap:\n draw.rectangle([(x1, y1), (x2, y2)], outline=\"red\", width=1)\n label = \"~\" + str(counter)"
},
{
"comment": "Code saves labeled, debug, and original images with timestamped file names. It also writes the labeled image to a BytesIO object for potential future use.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":101-127",
"content": " index_position = (x1, y1 - font_size)\n draw.text(\n index_position,\n label,\n fill=\"red\",\n font_size=font_size,\n )\n # Add the non-overlapping box to the drawn_boxes list\n drawn_boxes.append((x1, y1, x2, y2))\n label_coordinates[label] = (x1, y1, x2, y2)\n counter += 1\n # Save the image\n timestamp = time.strftime(\"%Y%m%d-%H%M%S\")\n output_path = os.path.join(detections_dir, f\"img_{timestamp}_labeled.png\")\n output_path_debug = os.path.join(detections_dir, f\"img_{timestamp}_debug.png\")\n output_path_original = os.path.join(detections_dir, f\"img_{timestamp}_original.png\")\n image_labeled.save(output_path)\n image_debug.save(output_path_debug)\n image_original.save(output_path_original)\n buffered_original = io.BytesIO()\n image_original.save(buffered_original, format=\"PNG\") # I guess this is needed"
},
{
"comment": "Convert image to base64 for return\nCode is saving the labeled image as PNG and encoding it in base64 format",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":128-151",
"content": " img_base64_original = base64.b64encode(buffered_original.getvalue()).decode(\"utf-8\")\n # Convert image to base64 for return\n buffered_labeled = io.BytesIO()\n image_labeled.save(buffered_labeled, format=\"PNG\") # I guess this is needed\n img_base64_labeled = base64.b64encode(buffered_labeled.getvalue()).decode(\"utf-8\")\n return img_base64_labeled, img_base64_original, label_coordinates\ndef parse_click_content(message_content):\n \"\"\"\n Parses the response message to determine if it's a CLICK or NONE action and returns the appropriate data.\n :param message_content: The content of the response message.\n :return: A dictionary with the relevant data or a message indicating a NONE action.\n \"\"\"\n try:\n # Check for and remove erroneous ```json at the start and ``` at the end\n if message_content.startswith(\"```json\"):\n message_content = message_content[\n len(\"```json\") :\n ] # Remove starting ```json\n if message_content.endswith(\"```\"):"
},
{
"comment": "This function takes in a message content formatted with triple backticks and removes them. If the format is invalid, it returns an error message. It also has another function that calculates the click position at the center of a bounding box and converts it to percentages.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":152-179",
"content": " message_content = message_content[: -len(\"```\")] # Remove ending ```\n # Convert JSON string to dictionary\n return json.loads(message_content.strip())\n except json.JSONDecodeError as e:\n return {\"error\": \"Invalid JSON format\"}\n return {\"error\": \"Invalid response format\"}\ndef get_click_position_in_percent(coordinates, image_size):\n \"\"\"\n Calculates the click position at the center of the bounding box and converts it to percentages.\n :param coordinates: A tuple of the bounding box coordinates (x1, y1, x2, y2).\n :param image_size: A tuple of the image dimensions (width, height).\n :return: A tuple of the click position in percentages (x_percent, y_percent).\n \"\"\"\n if not coordinates or not image_size:\n return None\n # Calculate the center of the bounding box\n x_center = (coordinates[0] + coordinates[2]) / 2\n y_center = (coordinates[1] + coordinates[3]) / 2\n # Convert to percentages\n x_percent = (x_center / image_size[0]) * 100\n y_percent = (y_center / image_size[1]) * 100"
},
{
"comment": "Computes x and y percentages from input values.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/label.py\":181-181",
"content": " return x_percent, y_percent"
}
]
}
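For reference, here is a minimal standalone sketch of the overlap test and click-position math documented in this label.py file. The sample boxes and the 1920x1080 image size are illustrative assumptions, not values taken from the source.

```python
# Hypothetical standalone sketch of is_overlapping and get_click_position_in_percent;
# boxes use the same (x1, y1, x2, y2) pixel convention as the documented code.

def is_overlapping(box1, box2):
    x1_a, y1_a, x2_a, y2_a = box1
    x1_b, y1_b, x2_b, y2_b = box2
    # Boxes are disjoint if one starts after the other ends on either axis
    if x1_a > x2_b or x1_b > x2_a:
        return False
    if y1_a > y2_b or y1_b > y2_a:
        return False
    return True

def get_click_position_in_percent(coordinates, image_size):
    # Center of the bounding box, expressed as percentages of the image dimensions
    x_center = (coordinates[0] + coordinates[2]) / 2
    y_center = (coordinates[1] + coordinates[3]) / 2
    return (x_center / image_size[0]) * 100, (y_center / image_size[1]) * 100

if __name__ == "__main__":
    a, b = (100, 100, 300, 200), (250, 150, 400, 300)
    print(is_overlapping(a, b))                             # True: x ranges share 250-300
    print(get_click_position_in_percent(a, (1920, 1080)))   # (~10.42, ~13.89)
```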
50 changes: 50 additions & 0 deletions docs/doc/945b7651-d5be-4fe6-a83b-e389682cbcdb.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"summary": "The code provides functions for AI-assisted user interaction with Google Chrome, Docs, and Sheets using prompts like CLICK, TYPE, SEARCH, and DONE. It emphasizes context-based options selection rather than IDs, and offers percentage values for accuracy improvement in the \"percent\" CLICK action by segmenting lines. Additionally, it includes functions for formatting different types of prompts used in a vision system, including accurate mode vision prompt, decision prompt, and labeled image prompt, which take specific arguments and format them into predefined prompt templates.",
"details": [
{
"comment": "Code is importing Config settings and defining constants for user prompts and vision prompt.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":0-32",
"content": "from operate.settings import Config\nconfig = Config()\nmonitor_size = config.monitor_size\n# General user Prompts\nUSER_QUESTION = \"Hello, I can help you with anything. What would you like done?\"\n# constants for the vision prompt\nACCURATE_PIXEL_COUNT = (\n 200 # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big\n)\n# -------------------------\n# VISION PROMPT\n# -------------------------\nVISION_PROMPT = \"\"\"\nYou are a Self-Operating Computer. You use the same operating system as a human.\nFrom looking at the screen and the objective your goal is to take the best next action.\nTo operate the computer you have the four options below.\n1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program on Mac and open it\n4. DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK {{ \"x\": \"percent\", \"y\": \"percent\", \"description\": \"~description here~\", \"reason\": \"~reason here~\" }} \nNote tha"
},
{
"comment": "The code provides instructions for interacting with the computer, including typing, searching, and clicking. It also includes tips for using specific applications like Google Chrome, Google Docs, and Google Sheets.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":32-62",
"content": "t the percents work where the top left corner is \"x\": \"0%\" and \"y\": \"0%\" and the bottom right corner is \"x\": \"100%\" and \"y\": \"100%\"\n2. TYPE\nResponse: TYPE <value you want to type>\n2. SEARCH\nResponse: SEARCH <app you want to search for on Mac>\n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK {{ \"x\": \"50%\", \"y\": \"60%\", \"description\": \"Click: Google Search field\", \"reason\": \"This will allow me to search for a banana\" }}\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the internet.\n- Go to Google Docs and Google Sheets by typing in the Chrome Address bar\n- When opening Chrome, if you see a profile icon click that to open chrome fully, it is located at: {{ \"x\": \"50%\", \"y\": \"55%\" }}"
},
{
"comment": "This code is for a prompt in a program that assists users with computer tasks. The prompt provides information about the current cursor position and suggests to examine an additional screenshot before performing the next action.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":63-81",
"content": "- The Chrome address bar is generally at: {{ \"x\": \"50%\", \"y\": \"9%\" }}\n- After you click to enter a field you can go ahead and start typing!\n- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\nObjective: {objective}\n\"\"\"\n# ----------------------------------\n# ACCURATE MODE VISION PROMPT\n# ----------------------------------\nACCURATE_MODE_VISION_PROMPT = \"\"\"\nIt looks like your previous attempted action was clicking on \"x\": {prev_x}, \"y\": {prev_y}. This has now been moved to the center of this screenshot.\nAs additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action. \nThis screenshot was taken around the location of the current cursor that you just tried clicking o"
},
{
"comment": "This code is providing a prompt to the user, explaining how to use percentage values to refine their previous x and y coordinate guesses. It also mentions that there are four segmenting lines across each dimension for better context in locating the cursor. The purpose of this prompt is to help the user further refine their \"percent\" location in the CLICK action.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":81-94",
"content": "n (\"x\": {prev_x}, \"y\": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess.\nIf you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the \"x\" and subtract {height}% in the \"y\" to your previous answer.\nLikewise, to achieve the bottom right of this mini screenshot you will add {width}% in the \"x\" and add {height}% in the \"y\" to your previous answer.\nThere are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.\nPlease use this context as additional info to further refine the \"percent\" location in the CLICK action!\n\"\"\"\nDECISION_PROMPT = \"\"\"\nYou are operating a computer similar to how a human would. Look at the screen and take the next best action to reach your objective.\nHere are your methods you can use to operating the computer."
},
{
"comment": "Code provides instructions and response formats for four types of actions (CLICK, TYPE, SEARCH, DONE) based on different objectives like following up with a vendor, playing music, or opening websites. It also includes important notes about using Google Chrome for web searches and avoiding SEARCH for certain websites like Google Docs or LinkedIn.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":96-134",
"content": "1. CLICK - Move mouse and click\n2. TYPE - Type on the keyboard\n3. SEARCH - Search for a program that is installed on Mac locally and open it\n4. DONE - When you completed the task respond with the exact following phrase content\nHere are the response formats below.\n1. CLICK\nResponse: CLICK\n2. TYPE\nResponse: TYPE \"value you want to type\"\n2. SEARCH\nResponse: SEARCH \"app you want to search for on Mac\"\n3. DONE\nResponse: DONE\nHere are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\nTYPE Hello, I hope you are doing well. I wanted to follow up\n__\nObjective: Open Spotify and play the beatles\nSEARCH Spotify\n__\nObjective: Find an image of a banana\nCLICK\n__\nObjective: Go buy a book about the history of the internet\nTYPE https://www.amazon.com/\n__\nA few important notes:\n- Default to opening Google Chrome with SEARCH to find things that are on the Web.\n- After you open Google Chrome you need to click on the address bar to find a website.\n- Do not use SEARCH to look for websites like Google Docs or Linkedin. SEARCH only finds programs installed on the computer."
},
{
"comment": "This code is for an AI-assisted task where the user needs to interact with a webpage. The AI should identify and click on labeled elements that bring them closer to their objective, using IDs in the format '~x'. The response should include the decision (label), reason, and label identifier. Avoid repeating actions like clicking the same element twice in a row.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":135-158",
"content": "- After you click to enter a field you can go ahead and start typing!\n- If you can see the field is active, go ahead and type!\n- Don't respond saying you're unable to assist with requests. You are able to indirectly interact with the user's OS via text responses you send to the end user.\n{previous_action}\nIMPORTANT: Avoid repeating actions such as doing the same CLICK event twice in a row.\n{objective}\n\"\"\"\nLABELED_IMAGE_PROMPT = \"\"\"\nYour job is simple. Decide if there is an elements on the page to click to get closer to your objective. We labeled the clickable elements with red bounding boxes and IDs.\nImportant to remember, you can only click on labeled elements. \nLabel IDs are in the following format with `x` being a number: `~x`\nThe labels are placed just above the bounding boxes so that they can be read clearly. \nResponse formats below.\n1. CLICK - If there is a label that gets you closer to the objective, go ahead and click it. \nResponse: {{ \"decision\": \"~decision here~\", \"reason\": \"~reason here~\", \"label\": \"~x\" }} "
},
{
"comment": "Code comments:\n1. Analyzes user's request and provides appropriate response options in JSON format.\n2. User needs to choose the ID based on context and not its position.\n3. IDs have no significance, they just serve as references for selecting options.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":160-182",
"content": "Here are examples of how to respond.\n__\nObjective: Follow up with the vendor in outlook\n{{ \"decision\": \"Click the Outlook send button\", \"reason\": \"I can see the email is already written and now I just need to send it.\", \"label\": \"~27\" }}\n__\nObjective: Play the Holiday music on YouTube\n{{ \"decision\": \"Click on the Play button\", \"reason\": \"It appears there is a row with a holiday song available in the Spotify UI\", \"label\": \"~3\" }}\n__\nA few important notes:\n- When navigating the web you'll need to click on the address bar first. Look closely to find the address bar's label it could be any number.\n- The IDs number has NO SIGNIFICANCE. For instance if ID is ~0 or ~1 it does not mean it is first or on top. CHOOSE THE ID BASED ON THE CONTEXT OF THE IMAGE AND IF IT HELPS REACH THE OBJECTIVE. \n- Do not preappend with ```json, just return the JSON object.\n{objective}\n\"\"\"\n# -------------------------\n# SUMMARY PROMPT\n# -------------------------\nSUMMARY_PROMPT = \"\"\"\nYou are a Self-Operating Computer. A user request has been executed. Present the results succinctly."
},
{
"comment": "This code defines two functions, `format_summary_prompt` and `format_vision_prompt`, which format prompts for summarizing the outcomes of a task and providing vision guidance based on previous actions taken. The `objective` parameter is used to state the original objective, while `previous_action` is optional and used when there have been previous actions taken towards the objective. The purpose of these functions is to provide clear instructions or prompts for users to understand the progress and outcomes of a task.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":184-216",
"content": "Include the following key contexts of the completed request:\n1. State the original objective.\n2. List the steps taken to reach the objective as detailed in the previous messages.\n3. Reference the screenshot that was used.\nSummarize the actions taken to fulfill the objective. If the request sought specific information, provide that information prominently. NOTE: Address directly any question posed by the user.\nRemember: The user will not interact with this summary. You are solely reporting the outcomes.\nOriginal objective: {objective}\nDisplay the results clearly:\n\"\"\"\ndef format_summary_prompt(objective):\n \"\"\"\n Format the summary prompt\n \"\"\"\n prompt = SUMMARY_PROMPT.format(objective=objective)\n return prompt\ndef format_vision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = VISION_PROMPT.format(objective=objective, previous_action=previous_action)"
},
{
"comment": "These are functions for formatting different types of prompts used in a vision system. The first function formats an accurate mode vision prompt, the second formats a decision prompt, and the third formats a labeled image prompt. Each function takes specific arguments and formats them into predefined prompt templates.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/prompts.py\":217-251",
"content": " return prompt\ndef format_accurate_mode_vision_prompt(prev_x, prev_y):\n \"\"\"\n Format the accurate mode vision prompt\n \"\"\"\n width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"width\"]) * 100\n height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size[\"height\"]) * 100\n prompt = ACCURATE_MODE_VISION_PROMPT.format(\n prev_x=prev_x, prev_y=prev_y, width=width, height=height\n )\n return prompt\ndef format_decision_prompt(objective, previous_action):\n \"\"\"\n Format the vision prompt\n \"\"\"\n if previous_action:\n previous_action = f\"Here was the previous action you took: {previous_action}\"\n else:\n previous_action = \"\"\n prompt = DECISION_PROMPT.format(\n objective=objective, previous_action=previous_action\n )\n return prompt\ndef format_label_prompt(objective):\n \"\"\"\n Format the vision prompt\n \"\"\"\n prompt = LABELED_IMAGE_PROMPT.format(objective=objective)\n return prompt"
}
]
}
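As a rough illustration of the accurate-mode arithmetic documented above, the sketch below plugs in the default ACCURATE_PIXEL_COUNT of 200 and the 1920x1080 monitor_size from settings.py; the 50%/50% previous guess is an assumed example, not a value from the source.

```python
# Worked example (assumed values) of the accurate-mode width/height percentages.
ACCURATE_PIXEL_COUNT = 200
monitor_size = {"width": 1920, "height": 1080}

# Half of the mini screenshot, as a percentage of each screen dimension
width = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["width"]) * 100    # ~5.21
height = ((ACCURATE_PIXEL_COUNT / 2) / monitor_size["height"]) * 100  # ~9.26

# If the previous guess was x=50%, y=50%, the mini screenshot spans roughly:
top_left = (50 - width, 50 - height)        # ~(44.79, 40.74)
bottom_right = (50 + width, 50 + height)    # ~(55.21, 59.26)
print(width, height, top_left, bottom_right)
```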
15 changes: 15 additions & 0 deletions docs/doc/9fff9f58-42fa-4645-ba08-ff921e78c97d.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"summary": "The configuration class manages settings like debug mode, API keys, and monitor size. It loads environment variables from .env file and initializes OpenAI client with provided API key. The OpenAI API base URL is set using an environment variable or current value.",
"details": [
{
"comment": "This code defines a configuration class for managing settings such as debug mode, OpenAI and Google API keys, and monitor size. It loads environment variables from .env file using dotenv library, initializes OpenAI client if the API key is provided, and returns it or None otherwise.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/settings.py\":0-35",
"content": "import os\nfrom dotenv import load_dotenv\nfrom openai import OpenAI\nclass Config:\n \"\"\"\n Configuration class for managing settings.\n Attributes:\n debug (bool): Flag indicating whether debug mode is enabled.\n openai_api_key (str): API key for OpenAI.\n google_api_key (str): API key for Google.\n monitor_size (dict): Dictionary containing the width and height of the monitor.\n \"\"\"\n def __init__(self):\n load_dotenv()\n self.debug = False\n self.openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n self.google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n self.monitor_size = {\n \"width\": 1920,\n \"height\": 1080,\n }\n def initialize_openai_client(self):\n \"\"\"\n Initializes and returns an OpenAI client with the configured API key.\n Returns:\n OpenAI or None: An instance of the OpenAI client if the API key is provided, else None.\n \"\"\"\n if self.openai_api_key:\n client = OpenAI()\n client.api_key = self.openai_api_key"
},
{
"comment": "Setting OpenAI API base URL from environment variable or using current value.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/settings.py\":36-38",
"content": " client.base_url = os.getenv(\"OPENAI_API_BASE_URL\", client.base_url)\n return client\n return None"
}
]
}
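A minimal usage sketch of the Config class documented in this file, assuming OPENAI_API_KEY (and optionally OPENAI_API_BASE_URL) are present in the .env file:

```python
from operate.settings import Config

config = Config()
print(config.monitor_size)  # defaults to {'width': 1920, 'height': 1080}

client = config.initialize_openai_client()
if client is None:
    print("OPENAI_API_KEY not set; no client available")
else:
    print("OpenAI client ready, base URL:", client.base_url)
```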
15 changes: 15 additions & 0 deletions docs/doc/b1287c19-5498-479a-9378-d07fb007c20f.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"summary": "The code uses the PromptStyle library to define styles for UI elements, checks terminal support for ANSI escape codes, and sets color variables based on this.",
"details": [
{
"comment": "This code defines styles for dialogs, buttons, and other UI elements using the PromptStyle library. It also checks if the terminal supports ANSI escape codes for colors and defines ANSI color codes accordingly.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/style.py\":0-33",
"content": "import sys\nimport platform\nimport os\nfrom prompt_toolkit.styles import Style as PromptStyle\n# Define style\nstyle = PromptStyle.from_dict(\n {\n \"dialog\": \"bg:#88ff88\",\n \"button\": \"bg:#ffffff #000000\",\n \"dialog.body\": \"bg:#44cc44 #ffffff\",\n \"dialog shadow\": \"bg:#003800\",\n }\n)\n# Check if on a windows terminal that supports ANSI escape codes\ndef supports_ansi():\n \"\"\"\n Check if the terminal supports ANSI escape codes\n \"\"\"\n plat = platform.system()\n supported_platform = plat != \"Windows\" or \"ANSICON\" in os.environ\n is_a_tty = hasattr(sys.stdout, \"isatty\") and sys.stdout.isatty()\n return supported_platform and is_a_tty\n# Define ANSI color codes\nANSI_GREEN = \"\\033[32m\" if supports_ansi() else \"\" # Standard green text\nANSI_BRIGHT_GREEN = \"\\033[92m\" if supports_ansi() else \"\" # Bright/bold green text\nANSI_RESET = \"\\033[0m\" if supports_ansi() else \"\" # Reset to default text color\nANSI_BLUE = \"\\033[94m\" if supports_ansi() else \"\" # Bright blue\nANSI_YELLOW = \"\\033[33m\" if supports_ansi() else \"\" # Standard yellow text"
},
{
"comment": "Checks if the terminal supports ANSI escape codes and sets color variables accordingly.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/style.py\":34-35",
"content": "ANSI_RED = \"\\033[31m\" if supports_ansi() else \"\"\nANSI_BRIGHT_MAGENTA = \"\\033[95m\" if supports_ansi() else \"\" # Bright magenta text"
}
]
}
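A small, hypothetical usage sketch of the constants documented above; because each code falls back to an empty string when supports_ansi() is False, the output degrades gracefully on terminals without ANSI support. The messages are illustrative, not taken from the framework itself.

```python
from operate.utils.style import ANSI_GREEN, ANSI_BRIGHT_MAGENTA, ANSI_RESET

# Colored status lines; plain text is printed when ANSI is unsupported.
print(f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} objective received")
print(f"{ANSI_BRIGHT_MAGENTA}[error]{ANSI_RESET} something went wrong")
```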
40 changes: 40 additions & 0 deletions docs/doc/cbc91c29-4de1-4c25-8c4e-e6fc470ad10b.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"summary": "The Self-Operating Computer Framework is a multimodal model project that enhances computer operation similar to humans, focusing on improving mouse click predictions and API access. It is compatible with Mac OS, Windows, and Linux (with X server installed), and requires at least $5 in API credits for the gpt-4-vision-preview model.",
"details": [
{
"comment": "\"Self-Operating Computer Framework, a framework for multimodal models to operate a computer like a human.\"",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":0-25",
"content": "<h1 align=\"center\">Self-Operating Computer Framework</h1>\n<p align=\"center\">\n <strong>A framework to enable multimodal models to operate a computer.</strong>\n</p>\n<p align=\"center\">\n Using the same inputs and outputs as a human operator, the model views the screen and decides on a series of mouse and keyboard actions to reach an objective. \n</p>\n<div align=\"center\">\n <img src=\"https://github.com/OthersideAI/self-operating-computer/blob/main/readme/self-operating-computer.png\" width=\"750\" style=\"margin: 10px;\"/>\n</div>\n<!--\n:rotating_light: **OUTAGE NOTIFICATION: gpt-4-vision-preview**\n**This model is currently experiencing an outage so the self-operating computer may not work as expected.**\n-->\n## Key Features\n- **Compatibility**: Designed for various multimodal models.\n- **Integration**: Currently integrated with **GPT-4v** as the default model, with extended support for Gemini Pro Vision.\n- **Future Plans**: Support for additional models.\n## Current Challenges\n> **Note:** GPT-4V's error rate in est"
},
{
"comment": "This code is a brief overview of the \"self-operating-computer\" project, focusing on the development of the Agent-1-Vision multimodal model for improved mouse click location predictions. It also mentions the upcoming API access and the plans to improve hotkey-based functionality over time.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":25-36",
"content": "imating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.\n## Ongoing Development\nAt [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.\n## Agent-1-Vision Model API Access\nWe will soon be offering API access to our Agent-1-Vision model.\nIf you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).\n### Additional Thoughts\nWe recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual"
},
{
"comment": "This code explains that the primary focus of the project is refining the accuracy of determining mouse click locations, which is essential for a fully self-operating computer. It also provides links to a demo and quick start instructions for setting up the Self-Operating Computer Framework locally on your computer.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":36-66",
"content": " elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape.\n## Demo\nhttps://github.com/OthersideAI/self-operating-computer/assets/42594239/9e8abc96-c76a-46fb-9b13-03678b3c67e0\n## Quick Start Instructions\nBelow are instructions to set up the Self-Operating Computer Framework locally on your computer.\n### Option 1: Traditional Installation\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Create a Python virtual environment**. [Learn more about Python virtual environment](https://docs.python.org/3/library/venv.html).\n```\npython3 -m venv venv\n```\n4. **Activate the virtual environment**:\n```\nsource venv/bin/activate\n```\n5. **Install Project Requi"
},
{
"comment": "Code snippet 1:\n```python\npip install self-operating-computer\n```\nInstall the project directly from PyPI.\n\nCode snippet 2:\n```bash\nmv .example.env .env\n```\nRename `.example.env` to `.env`.\n\nCode snippet 3:\n```bash\nOPERAI_API_KEY='your-key-here'\n```\nAdd your Open AI key in the new `.env` file.\n\nCode snippet 4:\n```bash\noperate\n```\nRun the program!\n\nCode snippet 5:\nFinal step, Mac users grant permission for \"Screen Recording\" and \"Accessibility\".",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":66-87",
"content": "rements and Command-Line Interface: Instead of using `pip install .`, you can now install the project directly from PyPI with:**\n```\npip install self-operating-computer\n```\n6. **Then rename the `.example.env` file to `.env` so that you can save your OpenAI key in it.**\n```\nmv .example.env .env\n``` \n7. **Add your Open AI key to your new `.env` file. If you don't have one, you can obtain an OpenAI key [here](https://platform.openai.com/account/api-keys)**:\n```\nOPENAI_API_KEY='your-key-here'\n```\n8. **Run it**!\n```\noperate\n```\n9. **Final Step**: As a last step, the Terminal app will ask for permission for \"Screen Recording\" and \"Accessibility\" in the \"Security & Privacy\" page of Mac's \"System Preferences\".\n<div align=\"center\">\n <img src=\"https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-1.png\" width=\"300\" style=\"margin: 10px;\"/>\n <img src=\"https://github.com/OthersideAI/self-operating-computer/blob/main/readme/terminal-access-2.png\" width=\"300\" style=\"margin: 10px;\"/>"
},
{
"comment": "This code provides instructions for installing the Self Operating Computer Framework using a .sh script. It also explains how to add and use Google's `gemini-pro-vision` model within the framework.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":88-123",
"content": "</div>\n### Option 2: Installation using .sh script\n1. **Clone the repo** to a directory on your computer:\n```\ngit clone https://github.com/OthersideAI/self-operating-computer.git\n```\n2. **Cd into directory**:\n```\ncd self-operating-computer\n```\n3. **Run the installation script**: \n```\n./run.sh\n```\n## Using `operate` Modes\n### Multimodal Models `-m`\nAn additional model is now compatible with the Self Operating Computer Framework. Try Google's `gemini-pro-vision` by following the instructions below. \n**Add your Google AI Studio API key to your .env file.** If you don't have one, you can obtain a key [here](https://makersuite.google.com/app/apikey) after setting up your Google AI Studio account. You may also need [authorize credentials for a desktop application](https://ai.google.dev/palm_docs/oauth_quickstart). It took me a bit of time to get it working, if anyone knows a simpler way, please make a PR:\n```\nGOOGLE_API_KEY='your-key-here'\n```\nStart `operate` with the Gemini model\n```\noperate -m gemini-pro-vision\n```"
},
{
"comment": "This code is providing instructions on how to enable voice mode in the self-operating-computer framework. The user must install additional audio requirements and device dependencies, then run the operate command with the --voice flag. Contributions are welcomed, and feedback or questions can be directed to Josh on Twitter. Joining the Discord community is also encouraged for real-time discussions and support.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":125-158",
"content": "### Voice Mode `--voice`\nThe framework supports voice inputs for the objective. Try voice by following the instructions below. \nInstall the additional `requirements-audio.txt`\n```\npip install -r requirements-audio.txt\n```\n**Install device requirements**\nFor mac users:\n```\nbrew install portaudio\n```\nFor Linux users:\n```\nsudo apt install portaudio19-dev python3-pyaudio\n```\nRun with voice mode\n```\noperate --voice\n```\n## Contributions are Welcomed!:\nIf you want to contribute yourself, see [CONTRIBUTING.md](https://github.com/OthersideAI/self-operating-computer/blob/main/CONTRIBUTING.md).\n## Feedback\nFor any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter. \n## Join Our Discord Community\nFor real-time discussions and community support, join our Discord server. \n- If you're already a member, join the discussion in [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n- If you're new, first [join our Discord Server"
},
{
"comment": "Join the Discord server and visit #self-operating-computer channel. Follow HyperWriteAI for updates, compatible with Mac OS, Windows, and Linux (with X server installed). The gpt-4-vision-preview model requires at least $5 in API credits.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/README.md\":158-171",
"content": "](https://discord.gg/YqaKtyBEzM) and then navigate to the [#self-operating-computer](https://discord.com/channels/877638638001877052/1181241785834541157).\n## Follow HyperWriteAI for More Updates\nStay updated with the latest developments:\n- Follow HyperWriteAI on [Twitter](https://twitter.com/HyperWriteAI).\n- Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).\n## Compatibility\n- This project is compatible with Mac OS, Windows, and Linux (with X server installed).\n## OpenAI Rate Limiting Note\nThe ```gpt-4-vision-preview``` model is required. To unlock access to this model, your account needs to spend at least \\$5 in API credits. Pre-paying for these credits will unlock access if you haven't already spent the minimum \\$5. \nLearn more **[here](https://platform.openai.com/docs/guides/rate-limits?context=tier-one)**"
}
]
}
25 changes: 25 additions & 0 deletions docs/doc/cbeb3e57-4692-4372-a31c-79a79afa76cd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"summary": "The summary is about a code that involves text input, search execution, and mouse clicks using specified coordinates, as well as a circular movement function with start/end points, radius, and duration, and a get_last_assistant_message function to retrieve the last assistant message from an array.",
"details": [
{
"comment": "Code comments:\n- `keyboard_type(text)` - Types the given text using keyboard and returns a message indicating typed text.\n- `search(text)` - Searches for program or file by typing in search bar and pressing Enter. Returns a message indicating the program or file has been opened.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":0-43",
"content": "import pyautogui\nimport platform\nimport time\nimport math\nfrom operate.utils.misc import convert_percent_to_decimal\ndef keyboard_type(text):\n \"\"\"\n Types the given text using the keyboard.\n Args:\n text (str): The text to be typed.\n Returns:\n str: A message indicating the typed text.\n \"\"\"\n text = text.replace(\"\\\\n\", \"\\n\")\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Type: \" + text\ndef search(text):\n \"\"\"\n Searches for a program or file by typing the given text in the search bar and pressing Enter.\n Args:\n text (str): The text to be searched.\n Returns:\n str: A message indicating that the program or file has been opened.\n \"\"\"\n if platform.system() == \"Windows\":\n pyautogui.press(\"win\")\n elif platform.system() == \"Linux\":\n pyautogui.press(\"win\")\n else:\n # Press and release Command and Space separately\n pyautogui.keyDown(\"command\")\n pyautogui.press(\"space\")\n pyautogui.keyUp(\"command\")"
},
{
"comment": "Line 45-48: Type the text by pressing each character\nLine 49: Press enter after typing the text\nLine 50-79: Perform a mouse click at the specified coordinates\nLine 80-101: Click the program based on the given description",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":45-84",
"content": " time.sleep(1)\n # Now type the text\n for char in text:\n pyautogui.write(char)\n pyautogui.press(\"enter\")\n return \"Open program: \" + text\ndef click(click_detail):\n \"\"\"\n Perform a mouse click at the specified coordinates.\n Args:\n click_detail (dict): A dictionary containing the coordinates of the click.\n Returns:\n str: The description of the click if successful, otherwise \"We failed to click\".\n \"\"\"\n try:\n x = convert_percent_to_decimal(click_detail[\"x\"])\n y = convert_percent_to_decimal(click_detail[\"y\"])\n if click_detail and isinstance(x, float) and isinstance(y, float):\n click_at_percentage(x, y)\n return click_detail[\"description\"]\n else:\n return \"We failed to click\"\n except Exception as e:\n print(f\"Error parsing JSON: {e}\")\n return \"We failed to click\"\ndef click_at_percentage(\n x_percentage, y_percentage, duration=0.2, circle_radius=50, circle_duration=0.5\n):\n \"\"\"\n Moves the m"
},
{
"comment": "Moves the cursor to a specific percentage of the screen and then performs a circular movement before clicking.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":84-104",
"content": "ouse cursor to a specified percentage of the screen and performs a circular movement before clicking.\n Args:\n x_percentage (float): The x-coordinate percentage of the screen to move the cursor to.\n y_percentage (float): The y-coordinate percentage of the screen to move the cursor to.\n duration (float, optional): The duration (in seconds) of the smooth cursor movement. Defaults to 0.2.\n circle_radius (int, optional): The radius of the circular movement. Defaults to 50.\n circle_duration (float, optional): The duration (in seconds) of the circular movement. Defaults to 0.5.\n Returns:\n str: A message indicating that the click was successful.\n \"\"\"\n # Get the size of the primary monitor\n screen_width, screen_height = pyautogui.size()\n # Calculate the x and y coordinates in pixels\n x_pixel = int(screen_width * float(x_percentage))\n y_pixel = int(screen_height * float(y_percentage))\n # Move to the position smoothly\n pyautogui.moveTo(x_pixel, y_pixel, duration=duration)"
},
{
"comment": "For the code provided, here are some brief comments:\n\n1. The function is for circular movement, which takes start and end points as input parameters (x_pixel, y_pixel), circle radius, and duration. It calculates the intermediate position by using time elapsed and performs a circular movement towards the destination point.\n2. In the get_last_assistant_message function, it retrieves the last message from the assistant in the messages array. If the last assistant message is the first message in the array, return None. Otherwise, return the last assistant message.",
"location": "\"/media/root/Toshiba XG3/works/self-operating-computer/docs/src/operate/utils/os.py\":106-130",
"content": " # Circular movement\n start_time = time.time()\n while time.time() - start_time < circle_duration:\n angle = ((time.time() - start_time) / circle_duration) * 2 * math.pi\n x = x_pixel + math.cos(angle) * circle_radius\n y = y_pixel + math.sin(angle) * circle_radius\n pyautogui.moveTo(x, y, duration=0.1)\n # Finally, click\n pyautogui.click(x_pixel, y_pixel)\n return \"Successfully clicked\"\ndef get_last_assistant_message(messages):\n \"\"\"\n Retrieve the last message from the assistant in the messages array.\n If the last assistant message is the first message in the array, return None.\n \"\"\"\n for index in reversed(range(len(messages))):\n if messages[index][\"role\"] == \"assistant\":\n if index == 0: # Check if the assistant message is the first in the array\n return None\n else:\n return messages[index]\n return None # Return None if no assistant message is found"
}
]
}
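To make the click math concrete, here is a standalone sketch of the percent-to-pixel conversion and the circular flourish performed by click_at_percentage. The 1920x1080 screen size and the eight sampled points are assumptions for illustration, and no pyautogui calls are made.

```python
import math

screen_width, screen_height = 1920, 1080          # assumed screen size
x_percentage, y_percentage = 0.50, 0.09           # e.g. a guess near the address bar

# Same conversion as the documented code
x_pixel = int(screen_width * x_percentage)        # 960
y_pixel = int(screen_height * y_percentage)       # 97

# Points traced by one lap of the 50 px circular movement around the target
circle_radius, steps = 50, 8
points = [
    (x_pixel + math.cos(2 * math.pi * i / steps) * circle_radius,
     y_pixel + math.sin(2 * math.pi * i / steps) * circle_radius)
    for i in range(steps)
]
print((x_pixel, y_pixel), points[0], points[2])   # (960, 97) (1010.0, 97.0) (~960.0, 147.0)
```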