#!/usr/bin/env python3
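"""Capture a region of the screen on a global hotkey (Ctrl+Alt+S), send the
screenshot to the OpenAI GPT-4 Vision API, and display the model's description
in a Tkinter window that also accepts follow-up questions about the same image."""
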
import subprocess
import base64
import requests
import threading
import keyboard
import time
import tkinter as tk
import os
from PIL import Image, ImageTk
# OpenAI API Key (be sure to set this environment variable before running the script)
api_key = os.getenv("OPENAI_API_KEY")
# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Function to send image or follow-up message to GPT-4 Vision API and get the description
def get_image_description(base64_image, followup_message=None):
    print("Sending request to GPT-4 Vision API...")
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "What’s in this image?"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 1000
    }
    # Append any prior assistant/user turns when this is a follow-up question
    if followup_message is not None:
        payload["messages"] += followup_message
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    return response.json()
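
# Example (hypothetical, outside the GUI flow) of a one-off call; the response
# shape matches how it is read further below:
#   b64 = encode_image("screenshot.png")
#   result = get_image_description(b64)
#   print(result['choices'][0]['message']['content'])
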
# Global variables for mouse and rectangle
start_x, start_y, end_x, end_y = 0, 0, 0, 0
rect_id = None
selection_window = None

# Mouse callback (pynput-style signature) that records drag start/end coordinates.
# It is not wired to a listener in this script; area selection is handled by gnome-screenshot.
def on_click(x, y, button, pressed):
    global start_x, start_y, end_x, end_y
    if pressed:
        start_x, start_y = x, y
    else:
        end_x, end_y = x, y
        return False  # Stop listener

def display_result(image_path, text):
    def run_window():
        root = tk.Tk()
        root.title("GPT-4V Output")
        # Position the window at the last recorded drag start (defaults to the top-left corner)
        window_x = start_x
        window_y = start_y
        # Display the captured image
        img = Image.open(image_path)
        imgtk = ImageTk.PhotoImage(image=img)
        img_label = tk.Label(root, image=imgtk)
        img_label.image = imgtk
        img_label.pack()
        # Scrollable Text widget for displaying the API response
        text_frame = tk.Frame(root)
        text_widget = tk.Text(text_frame, wrap='word', height=10)
        scrollbar = tk.Scrollbar(text_frame, command=text_widget.yview)
        text_widget.configure(yscrollcommand=scrollbar.set)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        text_frame.pack(fill=tk.BOTH, expand=True)
        # Populate the text widget with the API response
        text_widget.insert(tk.END, text)
        text_widget.config(state='disabled')  # Make the text widget read-only
        # Conversation history used when sending follow-up questions
        conversation_text = text
        # Text entry for follow-up questions
        entry_frame = tk.Frame(root)
        entry_var = tk.StringVar()
        entry_widget = tk.Entry(entry_frame, textvariable=entry_var, width=50)
        entry_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        entry_frame.pack(fill=tk.BOTH)
        # Send a follow-up question along with the prior conversation
        def send_followup(event=None):
            nonlocal conversation_text
            followup_text = entry_var.get()
            if followup_text.strip() == '':
                return  # Do nothing if the entry is empty
            followup_message = [
                {
                    "role": "assistant",
                    "content": conversation_text
                },
                {
                    "role": "user",
                    "content": followup_text
                }
            ]
            base64_image = encode_image(image_path)
            response = get_image_description(base64_image, followup_message)
            entry_var.set('')  # Clear the entry widget
            # Append the follow-up response to the text widget
            text_widget.config(state='normal')
            new_response_text = response['choices'][0]['message']['content']
            conversation_text += "\n\n" + new_response_text  # Update the conversation history
            text_widget.insert(tk.END, "\n\n" + new_response_text)
            text_widget.config(state='disabled')  # Make the text widget read-only again
        entry_widget.bind("<Return>", send_followup)
        send_button = tk.Button(entry_frame, text="Send", command=send_followup)
        send_button.pack(side=tk.RIGHT)
        root.geometry(f'+{window_x}+{window_y}')
        root.mainloop()
    window_thread = threading.Thread(target=run_window)
    window_thread.start()

# Take a screenshot of a user-selected area and send it to the API
def take_screenshot():
    screenshot_path = "screenshot.png"
    # gnome-screenshot's -a flag opens an interactive drag-to-select area grab
    subprocess.run(["gnome-screenshot", "-f", screenshot_path, "-a"])
    print("Screenshot taken!")
    time.sleep(1)
    base64_image = encode_image(screenshot_path)
    description = get_image_description(base64_image)
    print('description: ', description)
    display_result(screenshot_path, description['choices'][0]['message']['content'])

# Hotkey handler
def hotkey_function():
    print("Hotkey pressed! Drag to select screenshot area...")
    take_screenshot()

# Hotkey listener thread: blocks until Ctrl+Alt+S is pressed, then triggers a capture
def hotkey_listener():
    while True:
        keyboard.wait('ctrl+alt+s')
        hotkey_function()

# Main entry point: start the hotkey listener and keep the process alive
if __name__ == "__main__":
    listener_thread = threading.Thread(target=hotkey_listener, daemon=True)
    listener_thread.start()
    # Keep the main thread alive
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Program exiting...")