# scrape_images.py — scrape images from a paginated gallery with Selenium
# and save them to a local directory.

# Standard library imports.
import os
import time
from io import BytesIO

# Third-party imports.
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
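# Requirements (pip package names): selenium, webdriver-manager, requests, pillow,
# e.g. pip install selenium webdriver-manager requests pillow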
# Set up the Chrome driver; webdriver_manager downloads a matching chromedriver.
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
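# Optional: recent Chrome builds support headless mode via the "--headless=new"
# flag; uncomment the line below to run without a visible browser window.
# chrome_options.add_argument("--headless=new")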
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
# Target page URL — "site_link" is a placeholder; replace with the real URL.
driver.get("site_link")
# XPaths — the "xpath" values below are placeholders that must be filled in
# with selectors matching the target site's markup.
image_row_xpath = "xpath"
image_xpath = "xpath"
next_page_button_xpath = "xpath"
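# For illustration only (hypothetical selectors — the real ones depend entirely
# on the target site), the values might look something like:
# image_row_xpath = "//div[contains(@class, 'image-grid-row')]"
# image_xpath = ".//img[@srcset]"
# next_page_button_xpath = "//a[@aria-label='Next page']"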
# Scroll to the bottom of the page so lazily loaded images appear.
def load_more_images():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)  # generous fixed wait to ensure all images load
# Click the 'Next page' button via JavaScript, which avoids click interception
# by overlays that can break a normal .click().
def click_next_page():
    try:
        next_page_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, next_page_button_xpath))
        )
        driver.execute_script("arguments[0].click();", next_page_button)
    except Exception as e:
        print("Failed to click 'Next page' button:", e)
# Scrape image URLs, paging until 2000 unique URLs or 20 pages are collected.
image_urls = set()
page_count = 0
while len(image_urls) < 2000 and page_count < 20:  # limited to 20 pages
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, image_row_xpath))
    )
    load_more_images()  # scroll so lazily loaded images are present before scraping
    image_rows = driver.find_elements(By.XPATH, image_row_xpath)
    for row in image_rows:
        image_elements = row.find_elements(By.XPATH, image_xpath)
        for img in image_elements:
            srcset = img.get_attribute("srcset")
            if srcset:
                # Take the last srcset candidate's URL; this assumes candidates
                # are listed smallest-to-largest, which is the common convention,
                # so the last one is the highest-resolution image.
                img_url = srcset.split(", ")[-1].split(" ")[0]
                image_urls.add(img_url)
        if len(image_urls) >= 2000:
            break
    if len(image_urls) < 2000:
        click_next_page()
        page_count += 1
        time.sleep(10)  # wait for the next page to load
# Save the collected images to a local directory.
output_dir = "scripted_images"
os.makedirs(output_dir, exist_ok=True)
for i, url in enumerate(image_urls):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Convert to RGB so images with an alpha channel can be saved as JPEG.
        img.convert("RGB").save(os.path.join(output_dir, f"image_{i + 1}.jpg"))
    except Exception as e:
        print(f"Failed to save image {i + 1}: {e}")
driver.quit()
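# Note: wrapping the scraping logic in try/finally (with driver.quit() in the
# finally block) would guarantee the browser closes even if an error occurs.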