diff --git a/local_model.sh b/local_model.sh new file mode 100644 index 0000000..9dd1e71 --- /dev/null +++ b/local_model.sh @@ -0,0 +1 @@ +ssh -L 18001:localhost:18001 ec2-user@ec2-35-88-109-159.us-west-2.compute.amazonaws.com diff --git a/oldjeff.pem b/oldjeff.pem new file mode 100644 index 0000000..d7ceacc --- /dev/null +++ b/oldjeff.pem @@ -0,0 +1,27 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIEogIBAAKCAQEAxjnHtkAbblUvAl25H/wcOARtDFIUcI5ZyT2FpDvNijqEgfhy +bqLfSehfKY+N+qnOOXmxy1Wi1irbLEl/AhdTsJGZ9gNMEZdq66sq5hwIlpbweMsQ +swkclAScQHIMl9NaArLkOthcygnuswqTmvwPirNdmfm+Bt1IBrNCOrCA0elf4rUv +mpyQW6ANNW7RmVi8d8wXBhHrnij738ZHl7hC/noD9MEsWNfaGMQNng2rBvOgLgVY +OgxHIlp/74lnuflpPqmYnURKF3QkRJ82unY/TdFI71lKqypt93rn4pAw8HX9QqaC +X57mVWdlmQafCenQgHHWgJ9yYnQTam0s/+fNmwIDAQABAoIBAHLF3ca+k6Nsmw1p +qtjEJqqglWs+0yrgoUgN4SVYowfYHgULD2bT0yl97CuqPPDYBNnuhm1PJjuPENwx +qeJSE1j21QhGnHLLE1NlBi+6J5bZyl6GZSLksbFaggYmgvgdnc5WOiOARymMWrM7 ++n8QVwdeF2Ih4k8jLKMEg+JrdAsVaE9AfqrpGMKYJb18fVjj1RGf0q14mh2mTSeJ +dj8H5h8N8Fb6mSR+t7XLLn5uNufe/9R204H2CUYuRPN+AOW9xSLJbokNLlY5To2R +G+uiM0xJVrWvbFPxAtaTiI/nO5Oac6OUKmKI/5hBdQQ5qTkFeHsDNn8IzcCd2gzS +rgn0CYECgYEA/1SvoxKX5l/34nlcCcMAmyG4zKCUvdQPfoAIqhn2FBrHumBoBO4n +z1bLKaeZ9zdpVfI/P5TKjTawsn4P0QwExgxpJ9oZaLNokeRzW5GAZ87Cp6wzXbX8 +/1GSPyoCIyMY4YeHxYtss0O7CTtatLn+NnzsgHPO3H7RbgGf+E8ek2ECgYEAxr7H +iUwLRxiQ47XyMJKyVIuYcxh9dwpc/TTXvSRk9Abz7yvQENKEIodW+WfM+cHRj69S +fVnVsbA+azCbvIPk1NikTkIECeMK2fZ/XeoOSNi2gFinpVeICBCNylzHIdjRprh9 +gv5n6eb8/8DJFy9RIA/h4w/vtU3yv6zomp/1vnsCgYAShYAgIi8mpPuEUC1e/+hB +WJbhMRzZEFL3aC44uJ1jI/YtDOU+xk/Y+IDQSroedsSLWYFBCXgP+lGjAQYAshB+ +lVPjciy5rZn+S0Ya9FkOLq9sHk+zkooBs1cagd+Z0OfzJDOzHsQJ1PXyW33e8kcA +iNtXDg+JayGiCzgheQvMwQKBgG/ulpZ+24MpFMEKgeJVXFY9YJjB3DelAIYisrZ1 +vt2o5M140XAIAB8qNhO1ID4xqILR7RVn+PBgIGdiMvPTHJe7g54HlBq1YjEroMQV +xAHG+9IBHDoEuDpCiHjGE+i+IiVRlm6mNYQIccjgnOCP55K1HzUwjoJ/6g2FpmMf +X9ntAoGAWXT0yWMm4sCGbo4vd/jyPQPFnqWHODdqCmkL5vYMDV+CJXLioByDtFlh +lDQErdAFqjQdaoYBg2wxUwdBKzvz4b9HUstxg6MAVRKm3w+OgeendzO1t8h0c7ru 
+P/216mpn4Au0kPe5AMhy1339Qdod86QEztDtRXIaWk/mczpxtkc= +-----END RSA PRIVATE KEY----- \ No newline at end of file diff --git a/scripts/run-local.sh b/scripts/run-local.sh old mode 100644 new mode 100755 index 100382e..44b23ec --- a/scripts/run-local.sh +++ b/scripts/run-local.sh @@ -5,6 +5,8 @@ set -e source ./shared.sh +export OPENAI_API_KEY=empty + # Default parameters mode="azure" # Default to azure if no argument is provided prepare_image=false @@ -22,7 +24,7 @@ browser_port=8006 rdp_port=3390 start_client=true agent="navi" -model="gpt-4-vision-preview" +model="Qwen/Qwen2.5-VL-72B-Instruct" som_origin="oss" a11y_backend="uia" gpu_enabled=false diff --git a/scripts/run.sh b/scripts/run.sh old mode 100644 new mode 100755 diff --git a/src/win-arena-container/client/desktop_env/controllers/python.py b/src/win-arena-container/client/desktop_env/controllers/python.py index 596b752..8c9f7d4 100644 --- a/src/win-arena-container/client/desktop_env/controllers/python.py +++ b/src/win-arena-container/client/desktop_env/controllers/python.py @@ -174,15 +174,17 @@ def execute_python_windows_command(self, command: str) -> None: } - try: - response = requests.post(self.http_server + "/execute_windows", headers=headers, json=payload, timeout=90) - if response.status_code == 200: - logger.info("Command executed successfully: %s", response.text) - else: - logger.error("Failed to execute command. 
Status code: %d", response.status_code) - return response.json() - except requests.exceptions.RequestException as e: - logger.error("An error occurred while trying to execute the command: %s", e) + # try: + logger.info("execute_python_windows_command self.http_server: " + str(self.http_server)) + response = requests.post(self.http_server + "/execute_windows", headers=headers, json=payload, timeout=90) + logger.info("execute_python_windows_command response: " + str(response)) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to execute command. Status code: %d", response.status_code) + return response.json() + # except requests.exceptions.RequestException as e: + # logger.error("An error occurred while trying to execute the command: %s", e) def execute_python_command(self, command: str) -> None: """ diff --git a/src/win-arena-container/client/hello_world.py b/src/win-arena-container/client/hello_world.py new file mode 100644 index 0000000..8a83f58 --- /dev/null +++ b/src/win-arena-container/client/hello_world.py @@ -0,0 +1,3 @@ + +while True: + print("Hello World") \ No newline at end of file diff --git a/src/win-arena-container/client/lib_run_single.py b/src/win-arena-container/client/lib_run_single.py index a1146a4..e321488 100644 --- a/src/win-arena-container/client/lib_run_single.py +++ b/src/win-arena-container/client/lib_run_single.py @@ -6,6 +6,7 @@ import time import traceback from trajectory_recorder import TrajectoryRecorder +import pyfile_inject_execute logger = logging.getLogger("desktopenv.experiment") @@ -16,11 +17,17 @@ time_limit = data["time_limit"] def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores): + logger.info("run_single_example start...") agent.reset() obs = env.reset(task_config=example) done = False step_idx = 0 + # inject and execute + logger.info("pyfile_inject_execute.inject_and_execute starts") + 
pyfile_inject_execute.inject_and_execute(envV = env, controllerV=env.controller) + logger.info("pyfile_inject_execute.inject_and_execute ends") + #env.controller.start_recording() start_time = datetime.datetime.now() @@ -39,10 +46,12 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl continue logger.info("Agent: Thinking...") + pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "before agent.predict") response, actions, logs, computer_update_args = agent.predict( instruction, obs ) + pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "after agent.predict") # update the computer object, used by navi's action space if computer_update_args: @@ -55,7 +64,9 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl elapsed_timestamp = f"{datetime.datetime.now() - start_time}" logger.info("Step %d: %s", step_idx + 1, action) + pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "before env.step") obs, reward, done, info = env.step(action, args.sleep_after_execution) + pyfile_inject_execute.notify(envV = env, controllerV=env.controller, data = "after env.step") logger.info("Reward: %.2f", reward) logger.info("Done: %s", done) @@ -89,4 +100,6 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl # Record final results recorder.record_end(result, start_time) - # env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) \ No newline at end of file + # env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) + + logger.info("run_single_example end...") \ No newline at end of file diff --git a/src/win-arena-container/client/mm_agents/navi/agent.py b/src/win-arena-container/client/mm_agents/navi/agent.py index c362469..d952d9a 100644 --- a/src/win-arena-container/client/mm_agents/navi/agent.py +++ b/src/win-arena-container/client/mm_agents/navi/agent.py @@ -3,6 +3,7 @@ 
import re from typing import Dict, List # from mm_agents.planner.computer import Computer, WindowManager +from mm_agents.navi.gpt.openAI_planner import OpenAI_Planner from mm_agents.navi.gpt.gpt4v_planner import GPT4V_Planner from mm_agents.navi.gpt import planner_messages import copy @@ -113,9 +114,12 @@ def __init__( if model == 'phi3-v': from mm_agents.navi.gpt.phi3_planner import Phi3_Planner - self.gpt4v_planner = Phi3_Planner(server='azure',model='phi3-v',temperature=temperature) + # self.gpt4v_planner = Phi3_Planner(server='azure',model='phi3-v',temperature=temperature) + self.gpt4v_planner = OpenAI_Planner(temperature=temperature) else: - self.gpt4v_planner = GPT4V_Planner(server=self.server, model=self.model, temperature=temperature) + # self.gpt4v_planner = GPT4V_Planner(server=self.server, model=self.model, temperature=temperature) + self.gpt4v_planner = OpenAI_Planner(temperature=temperature) + if use_last_screen: self.gpt4v_planner.system_prompt = planner_messages.planning_system_message_shortened_previmg @@ -398,11 +402,16 @@ def predict(self, instruction: str, obs: Dict) -> List: image_prompts = [last_image, image_prompt_resized] # send to gpt - logger.info("Thinking...") + logger.info("Thinking... 
model:"+self.gpt4v_planner.model) + logger.info("OpenAI info 1: "+self.gpt4v_planner.gpt4v.model) + logger.info("OpenAI info 2: "+str(self.gpt4v_planner.gpt4v.client.base_url)) + logger.info("OpenAI info 3: "+str(self.gpt4v_planner.gpt4v.client.api_key)) plan_result = self.gpt4v_planner.plan(image_prompts, user_question) logs['plan_result'] = plan_result + logger.info("plan_result: "+str(plan_result)) + # extract the textual memory block memory_block = re.search(r'```memory\n(.*?)```', plan_result, re.DOTALL) if memory_block: diff --git a/src/win-arena-container/client/mm_agents/navi/gpt/gpt4v_oai.py b/src/win-arena-container/client/mm_agents/navi/gpt/gpt4v_oai.py index cda9672..6100ddf 100644 --- a/src/win-arena-container/client/mm_agents/navi/gpt/gpt4v_oai.py +++ b/src/win-arena-container/client/mm_agents/navi/gpt/gpt4v_oai.py @@ -18,13 +18,15 @@ class GPT4VisionOAI: - def __init__(self, model="gpt-4o"): + # def __init__(self, model="gpt-4o"): + def __init__(self, model="ByteDance-Seed/UI-TARS-1.5-7B"): self.model = model #oad key from environment variable self.api_key = os.getenv("OPENAI_API_KEY") if self.api_key is None: - print("API key not found in environment variable.") - self.client = openai.OpenAI(api_key=self.api_key) + print("API key not found in environment variable. 
Setting to 'empty'.") + self.api_key = "empty" + self.client = openai.OpenAI(api_key=self.api_key,base_url="http://ec2-35-88-109-159.us-west-2.compute.amazonaws.com:18001/v1", max_retries=0) def encode_image(self, image: Union[str, Image.Image]) -> str: if isinstance(image, str): @@ -55,7 +57,7 @@ def get_base64_payload(self, base64_image: str, detail="auto") -> dict: } } - @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(30)) + # @retry(wait=wait_random_exponential(min=1, max=1), stop=stop_after_attempt(30)) def process_images(self, system_prompt: str, question: str, images: Union[str, Image.Image, List[Union[str, Image.Image]]], detail="auto", max_tokens=300, temperature=1.0, only_text=True, format="JPEG") -> str: if system_prompt==None: @@ -73,14 +75,19 @@ def process_images(self, system_prompt: str, question: str, images: Union[str, I base64_image = self.encode_image(image) content.append(self.get_base64_payload(base64_image, detail=detail)) + print("gpt4voai model: "+self.model) + print("gpt4voai client info 1: "+str(self.client.api_key)) + print("gpt4voai client info 2: "+str(self.client.base_url)) + print("gpt4voai client info 3: "+str(self.client.max_retries)) + response = self.client.chat.completions.create( # model="gpt-4-vision-preview", model=self.model, messages=[ - { - "role": "system", - "content": system_prompt - }, + # { + # "role": "system", + # "content": system_prompt + # }, { "role": "user", "content": content @@ -102,7 +109,7 @@ def main(): system_prompt = "You are a helpful assistant." 
# SINGLE RESOURCE - gpt4v_wrapper = GPT4VisionOAI(model="gpt-4-1106-vision-preview") + gpt4v_wrapper = GPT4VisionOAI(model="ByteDance-Seed/UI-TARS-1.5-7B") # process a single image start_time = time.time() diff --git a/src/win-arena-container/client/mm_agents/navi/gpt/openAI_planner.py b/src/win-arena-container/client/mm_agents/navi/gpt/openAI_planner.py new file mode 100644 index 0000000..f936a75 --- /dev/null +++ b/src/win-arena-container/client/mm_agents/navi/gpt/openAI_planner.py @@ -0,0 +1,58 @@ +import os +import inspect +import re +from mm_agents.navi.gpt import gpt4v_azure, gpt4v_oai, system_messages, planner_messages +import tiktoken +import time +import json +import re + +class OpenAI_Planner(): + def __init__(self, server="azure", model="ByteDance-Seed/UI-TARS-1.5-7B", temperature=1.0): + self.server = server + self.model = model + self.temperature = temperature + + # if self.server=="azure": + # self.gpt4v = gpt4v_azure.GPT4VisionAzure() + # elif self.server=="oai": + # self.gpt4v = gpt4v_oai.GPT4VisionOAI(self.model) + # else: + # raise ValueError(f"Server {server} not supported") + self.gpt4v = gpt4v_oai.GPT4VisionOAI(model=self.model) + + # set the initial system message + self.system_prompt = planner_messages.planning_system_message + + def plan(self, images, user_query): + response = self.gpt4v.process_images(self.system_prompt, user_query, images, max_tokens=4096, temperature=self.temperature, only_text=True) + return response + + def describe_elements(self, screenshot, crops, descriptions=None) -> str: + n = len(crops) + system_prompt = f"you will be presented with crops of interactive element in the screen and a screenshot marked with red bounding-boxes. Your task is to describe each element and infer its function. A single crop may contain multiple elements, if so, describe them all in a single paragraph. You must provide one description for each of the {n} elements provided." + + user_query = f"Given the element and screenshot. 
what could be the purpose of these {n} elements? The last image is the screenshot with the elements marked with red bounding boxes." + + print(system_prompt) + print(user_query) + + r = self.gpt4v.process_images(system_prompt, user_query, crops+[screenshot], max_tokens=4096, temperature=0.0, only_text=True) + + # display(Image.open(screenshot_tagged)) + print(r) + + + structuring_prompt = "Given descriptions of the elements/images, format into a json with the element index as key and values as summarized descriptions. if a single description references to multiple elements, break it down into the appropriate items. Make sure to remove from the descriptons any references to the element index. e.g input \n'Here's a description of the elements\n 1. The first icon looks liek a bell indicating... \n2. The second and third elements represent a magnifying glass...\n3. it appears to be a ball' output: ```json\n{'1':'A bell indicating...', '2':'A magnifying glass...','3':'A magnifying glass...', '4':'ball' ...}```." + user_query= f"Structure the following text into a json with descriprions of the {n} elements/images. 
\n'" + r + "\n'" + formatted = self.gpt4v.process_images(structuring_prompt, user_query,[], max_tokens=4096, temperature=0.0, only_text=True) + print(formatted) + try: + # extract code block + formatted = re.search(r"```json\n(.*)```", formatted, re.DOTALL).group(1) + result = json.loads(formatted) + except Exception as e: + print(f"{formatted}\n\n\nFailed to extract json from response: {e}") + result = {} + print(result) + return result \ No newline at end of file diff --git a/src/win-arena-container/client/pyfile_inject_execute.py b/src/win-arena-container/client/pyfile_inject_execute.py new file mode 100644 index 0000000..c9e5c19 --- /dev/null +++ b/src/win-arena-container/client/pyfile_inject_execute.py @@ -0,0 +1,86 @@ + +import os +import os.path +import logging +import requests +import json +from requests_toolbelt.multipart.encoder import MultipartEncoder + +from desktop_env.controllers.python import PythonController +from desktop_env.envs.desktop_env import DesktopEnv + + +logger = logging.getLogger("inject_and_execute") + +def inject_and_execute(envV: DesktopEnv, controllerV: PythonController): + print("1") + # prepare basic infos + pyFile = "hello_world.py" + local_path: str = "./"+pyFile + path: str = "C:/Users/Docker/Desktop/" + pyFile + # action = "python "+path + # has terminal window overlap + # action = "import subprocess; subprocess.Popen('python "+path+"',creationflags=subprocess.CREATE_NEW_CONSOLE);" + # no terminal window overlap + action = "import subprocess; subprocess.Popen('pythonw "+path+"',creationflags=subprocess.CREATE_NEW_CONSOLE);" + + # inject execute py file through upload post request + if not os.path.exists(local_path): + print(f"Setup Upload - Invalid local path ({local_path}).") + return + + print("2") + form = MultipartEncoder({ + "file_path": path, + "file_data": (os.path.basename(path), open(local_path, "rb")) + }) + headers = {"Content-Type": form.content_type} + print(form.content_type) + + ## send request to server to 
upload file
+    http_server = f"http://{controllerV.vm_ip}:5000"
+
+    logger.info("upload py file 4 injection: "+str(form))
+
+    print("3")
+    try:
+        print("REQUEST ADDRESS: %s", http_server + "/setup" + "/upload")
+        print("REQUEST FORM: "+str(form))
+        response = requests.post(http_server + "/setup" + "/upload", headers=headers, data=form)
+        if response.status_code == 200:
+            print("Command executed successfully: " + response.text)
+        else:
+            print("Failed to upload file. Status code: " + response.text)
+    except requests.exceptions.RequestException as e:
+        print("An error occurred while trying to send the request: " + str(e))
+
+    # execute py file through execute_python_windows_command post request
+    print("4")
+    print("controllerV.execute_python_windows_command: "+str(action))
+    controllerV.execute_python_command(action)
+    # controllerV.execute_command_new_terminal(action)
+    print("5")
+
+
+def notify(envV: DesktopEnv, controllerV: PythonController, data: str):
+    # prepare basic infos
+    pyFile = "notify.txt"
+    local_path: str = "./"+pyFile
+    path: str = "C:/Users/Docker/Desktop/" + pyFile
+    http_server = f"http://{controllerV.vm_ip}:5000"
+
+
+    payload = json.dumps({"file_path": path, "file_data": data})
+    headers = {'Content-Type': 'application/json'}
+
+    try:
+        print("REQUEST ADDRESS: %s", http_server + "/notify")
+        print("REQUEST FORM: "+str(payload))
+        response = requests.post(http_server + "/notify", headers=headers, data=payload, timeout=90)
+        if response.status_code == 200:
+            print("Command executed successfully: " + response.text)
+        else:
+            print("Failed to notify. 
Status code: " + response.text)
+    except requests.exceptions.RequestException as e:
+        print("An error occurred while trying to send the request: " + str(e))
+
diff --git a/src/win-arena-container/vm/setup/server/main.py b/src/win-arena-container/vm/setup/server/main.py
index 8d218ff..bf3a861 100644
--- a/src/win-arena-container/vm/setup/server/main.py
+++ b/src/win-arena-container/vm/setup/server/main.py
@@ -170,7 +170,7 @@ def execute_command_windows():
     data = request.get_json()
     # shell = data.get('shell', False)
     command = data.get('command')
-    print(command)
+    print("/execute_windows: "+command)
 
     try:
         # exec(command_with_computer)
@@ -235,6 +235,45 @@ def execute_command():
     #         'data': data
     #     }), 500
 
+@app.route('/notify', methods=['POST'])
+def notify_env_agent():
+
+    data = request.json
+    # The 'command' key in the JSON request should contain the command to be executed.
+    file_data_ = data.get('file_data', ">>>>>>> no data in post <<<<<<<")
+    file_path_ = data.get('file_path', "C:/Users/Docker/Desktop/temp.txt")
+
+    try:
+
+        # find file from file_path
+        file_path = os.path.expandvars(os.path.expanduser(file_path_))
+
+        # if not exist, then create a new file; else write to file
+        if not os.path.exists(file_path):
+            with open(file_path, "w") as file:
+                file.write("created new notify logger file\n")
+
+        # Write the new data to the file
+        with open(file_path, "a") as file:
+            file.write(file_data_+"\n")
+
+        return jsonify({
+            'status': 'notify and write success',
+            'error': ''
+        })
+    except Exception as e:
+        logger.error("\n" + traceback.format_exc() + "\n")
+        return jsonify({
+            'status': 'error',
+            'message': str(e)
+        }), 500
+    # return jsonify({
+    #     'status': 'error',
+    #     'message': str(e),
+    #     'shell': str(shell),
+    #     'command': command,
+    #     'data': data
+    # }), 500
 
 def _get_machine_architecture() -> str:
     """
     Get the machine architecture, e.g., x86_64, arm64, aarch64, i386, etc. 
diff --git a/test_claude.py b/test_claude.py new file mode 100644 index 0000000..63df45d --- /dev/null +++ b/test_claude.py @@ -0,0 +1,14 @@ +import requests + +url = "http://ec2-35-88-109-159.us-west-2.compute.amazonaws.com:18001/v1/chat/completions" + +payload = { + "model":"ByteDance-Seed/UI-TARS-1.5-7B", # "sonnet4", #"sonnet37v1", + "messages": [ + {"role": "user", "content": "What's the difference between Claude 3.5 and GPT-4o?"} + ], + "temperature": 0.7 +} + +response = requests.post(url, json=payload) +print(response.json()) \ No newline at end of file