-
Notifications
You must be signed in to change notification settings - Fork 3
/
scraper.py
166 lines (125 loc) · 7.36 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import os
import csv
import time
import tkinter as tk
from tkinter import filedialog, simpledialog
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
def get_username():
    """Return the login name of the user running this script.

    ``os.getlogin()`` is tried first so the result is unchanged wherever it
    already worked. It raises ``OSError`` when the process has no controlling
    terminal, so in that case the name is taken from ``getpass.getuser()``
    instead (environment variables first, then the password database).

    Returns:
        The user name as a string.
    """
    try:
        return os.getlogin()
    except OSError:
        # Local import: the file's top-level imports do not include getpass.
        import getpass

        return getpass.getuser()
def get_search_query():
    """Ask the user for the chat search text using a modal dialog box.

    A hidden Tk root window is created only to host the dialog and is
    destroyed again before returning, so repeated calls do not accumulate
    Tk instances.

    Returns:
        The string typed by the user, or ``None`` if the dialog was cancelled.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main tkinter window; only the dialog is shown
    try:
        return simpledialog.askstring("WhatsApp Search", "Enter your search query:")
    finally:
        # Always release the hidden root, even if the dialog raises.
        root.destroy()
# CSS selector of the subtitle <span> in the header of the currently open chat.
# NOTE(review): the class names look auto-generated, so this selector is
# probably brittle -- confirm against the current WhatsApp Web markup.
_CHAT_SUBTITLE_SELECTOR = '#main > header > div._2au8k > div.p357zi0d.r15c9g6i.g4oj0cdv.ovllcyds.l0vqccxk.pm5hny62 > span'

# Upper bound, in seconds, for waiting on the "click here" placeholder text.
_PLACEHOLDER_TIMEOUT_SECONDS = 30

# Characters that Windows does not allow inside a file name.
_ILLEGAL_FILENAME_CHARS = '<>:"/\\|?*'


def _safe_filename_part(text):
    """Return *text* with characters that are illegal in file names replaced by '_'."""
    return ''.join(
        '_' if ch in _ILLEGAL_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in text
    )


def _wait_for_subtitle(driver, timeout=_PLACEHOLDER_TIMEOUT_SECONDS):
    """Return the chat header subtitle element once it no longer says "click here".

    The element is looked up again on every poll instead of reusing one
    reference, and the loop sleeps between polls and gives up after *timeout*
    seconds rather than spinning forever.

    Returns:
        The subtitle element, or ``None`` if the "click here" text was still
        shown when the timeout expired.

    Raises:
        NoSuchElementException: if the subtitle element is not on the page.
    """
    deadline = time.monotonic() + timeout
    while True:
        subtitle = driver.find_element(By.CSS_SELECTOR, _CHAT_SUBTITLE_SELECTOR)
        if "click here" not in subtitle.text:
            return subtitle
        if time.monotonic() >= deadline:
            return None
        time.sleep(0.5)


def _save_contact_list(filename, contact_names):
    """Write *contact_names* to *filename* as a one-column CSV with a 'Title' header."""
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Title'])
        writer.writerows([name] for name in contact_names)


def _export_chat_contacts(driver, index, target_element):
    """Open one matching chat and save the names from its header subtitle to a CSV file.

    Any problem with this chat (click failure, header not appearing, subtitle
    missing or not usable) is reported and the chat is skipped, so the caller
    can carry on with the next one.
    """
    # Local import: the file's top-level imports only provide NoSuchElementException.
    from selenium.common.exceptions import TimeoutException

    try:
        # Read the title before clicking: it is needed for the file name, and
        # the list entry may no longer be attached once the chat has opened.
        chat_title = target_element.get_attribute("title")
        print("Title: ", chat_title)
        print("Text: ", target_element.text)
        target_element.click()
    except NoSuchElementException:
        print("error occur here")
        print(target_element)
        # The chat was not opened, so the header would belong to another chat.
        return

    try:
        # Wait for the contact page header to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, _CHAT_SUBTITLE_SELECTOR))
        )
        contact_element = _wait_for_subtitle(driver)
    except (NoSuchElementException, TimeoutException):
        print("Contact element not found. Skipping this target element.")
        print("=========================================")
        return

    if contact_element is None:
        print("Contact element still shows 'click here' after waiting. Skipping this target element.")
        print("=========================================")
        return

    # Checked after the placeholder wait so a "last seen" subtitle that only
    # appears once the placeholder is replaced is caught as well.
    if "last seen" in contact_element.text:
        print("Contact element contains 'last seen' pattern. Skipping this target element.")
        print("=========================================")
        return

    contact_names = contact_element.get_attribute('title')
    print("=========================================")
    if contact_names is None:
        print("Contact element has no title attribute. Skipping this target element.")
        print("=========================================")
        return

    # The title attribute holds the names as one comma-separated string.
    filename = f"Contact_List_{index + 1}_{_safe_filename_part(chat_title)}.csv"
    _save_contact_list(filename, contact_names.split(', '))
    print(f'Contact list saved to "{filename}"')
    print("=========================================")


def extract_title_attribute():
    """Search WhatsApp Web for chats and export each match's header names to CSV.

    Steps:
      1. Ask the user to pick the Chrome WebDriver executable.
      2. Start Chrome with the current user's Chrome profile directory and
         open WhatsApp Web, waiting up to 60 seconds for the chat pane.
      3. Ask for a search query, type it into the search box and collect the
         chat entries whose title contains the query (case-insensitive).
      4. Open every match in turn and write the comma-separated names from
         the chat header's ``title`` attribute to
         ``Contact_List_<n>_<chat title>.csv`` in the working directory.

    Errors are printed rather than raised. The browser is always closed once
    it has been started, including when an error occurs or the query dialog
    is cancelled.
    """
    driver = None
    try:
        username = get_username()

        # Prompt the user to select the Chrome driver executable
        executable_path = filedialog.askopenfilename(title="Select Chrome WebDriver executable", filetypes=[("Executable Files", "*.exe")])
        # A cancelled dialog yields an empty value, which is not a valid file either.
        if not executable_path or not os.path.isfile(executable_path):
            raise FileNotFoundError(f"Invalid Chrome WebDriver executable path: {executable_path}")

        # Chrome profile directory expected to hold an active WhatsApp Web session
        user_data_directory = fr'C:\Users\{username}\AppData\Local\Google\Chrome\User Data'
        options = Options()
        options.add_argument(f'--user-data-dir={user_data_directory}')
        service = Service(executable_path=executable_path)
        driver = webdriver.Chrome(service=service, options=options)

        driver.get('https://web.whatsapp.com/')
        # Wait for the user to scan the QR code (if needed) and for the chat pane to appear
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pane-side')))
        time.sleep(2)  # Give the page a moment to settle before interacting

        search_query = get_search_query()
        if search_query is None:
            # Dialog cancelled: nothing to search for.
            print("No search query entered.")
            return

        search_input_element = driver.find_element(By.CSS_SELECTOR, 'div[title="Search input textbox"]')
        search_input_element.click()
        search_input_element.send_keys(search_query)
        time.sleep(5)  # Let the search results render

        chat_elements = driver.find_elements(By.XPATH, '//div[@role="row"]//div[@data-testid="cell-frame-container"]//div[@class="y_sn4"]//span[@title]')

        # Keep the entries whose title contains the query anywhere, ignoring case.
        needle = search_query.lower()
        target_elements = [
            chat_element for chat_element in chat_elements
            if needle in chat_element.get_attribute("title").lower()
        ]

        if target_elements:
            print(f"{len(target_elements)} Target elements found!")
            for target_element in target_elements:
                print("Title: ", target_element.get_attribute("title"))
                print("Text: ", target_element.text)
                print("Text: ", target_element)

            # Open the matches one by one and export each chat's contact list
            for index, target_element in enumerate(target_elements):
                _export_chat_contacts(driver, index, target_element)
        else:
            print("No target elements found.")

        time.sleep(5)
    except Exception as e:
        print("An error occurred:", e)
    finally:
        # Close the browser on every exit path once it has been started.
        if driver is not None:
            driver.quit()
# Script entry point: start the export only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    extract_title_attribute()