# scrap.py
import csv
import logging
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Set up logging
logging.basicConfig(
    filename='scraping_log.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
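
# Note: with this configuration, log records go only to scraping_log.log. To
# also echo them to the console, one option (optional sketch) is:
#   logging.getLogger().addHandler(logging.StreamHandler())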

def scrape_amazon_static(url: str, output_file: str) -> None:
    """
    Scrapes a given Amazon search page using Requests + BeautifulSoup (static).
    Saves product data (name, price, rating) to a CSV file.

    :param url: The Amazon search URL to scrape.
    :param output_file: The name of the CSV file where results are saved.
    """
    # Set custom headers to appear as a "real" browser
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
        ),
        "Accept-Language": "en-US, en;q=0.9"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise HTTPError on 4xx/5xx responses
    except requests.RequestException as e:
        logging.error(f"Requests error: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Open CSV in write mode
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Name", "Price", "Rating"])  # CSV header

        # Find product containers
        products = soup.find_all("div", {"data-component-type": "s-search-result"})
        for product in products:
            try:
                # Extract product name
                name_elem = product.find("h2")
                name = name_elem.get_text(strip=True) if name_elem else "N/A"

                # Extract price
                price_elem = product.find("span", {"class": "a-price-whole"})
                price = price_elem.get_text(strip=True) if price_elem else "N/A"

                # Extract rating
                rating_elem = product.find("span", {"class": "a-icon-alt"})
                rating = rating_elem.get_text(strip=True) if rating_elem else "N/A"

                writer.writerow([name, price, rating])
            except Exception as e:
                logging.error(f"Error parsing product: {e}")

    logging.info(f"Static scrape completed. Data saved to: {output_file}")

def scrape_amazon_dynamic(url: str, output_file: str, headless: bool = True) -> None:
    """
    Scrapes a given Amazon search page using Selenium (dynamic).
    Saves product data (name, price, rating) to a CSV file.
    This approach can handle JavaScript rendering (e.g., infinite scroll).

    :param url: The Amazon search URL to scrape.
    :param output_file: The name of the CSV file where results are saved.
    :param headless: Whether to run Chrome in headless mode (no browser window).
    """
    # Chrome Options for Selenium
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    # (Optional) Reduce debug logging
    chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

    try:
        # Install/update ChromeDriver automatically. Selenium 4 takes the
        # driver path via a Service object, not as a positional argument.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.get(url)
    except Exception as e:
        logging.error(f"Selenium error: {e}")
        return

    # Use an explicit wait to ensure product containers are loaded
    wait = WebDriverWait(driver, 15)  # wait up to 15 seconds
    try:
        wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, "//div[@data-component-type='s-search-result']")
        ))
    except Exception as e:
        logging.error(f"Timeout waiting for search results: {e}")
        driver.quit()
        return

    # (Optional) Scroll to the bottom to force lazy-loading of additional
    # products; this helps when Amazon loads more items as you scroll.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # short pause after scrolling
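    # A possible extension (sketch, commented out): keep scrolling until the
    # page height stops growing, which can surface more lazy-loaded results.
    # last_height = driver.execute_script("return document.body.scrollHeight")
    # while True:
    #     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    #     time.sleep(2)
    #     new_height = driver.execute_script("return document.body.scrollHeight")
    #     if new_height == last_height:
    #         break
    #     last_height = new_height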
    # Grab all product elements
    products = driver.find_elements(By.XPATH, "//div[@data-component-type='s-search-result']")

    # Open CSV in write mode
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Name", "Price", "Rating"])

        for product in products:
            try:
                # Extract product name. find_elements returns an empty list
                # (rather than raising) when nothing matches, so a missing
                # field becomes "N/A" instead of skipping the whole product.
                name_elems = product.find_elements(By.XPATH, ".//h2//span")
                name = name_elems[0].text.strip() if name_elems else "N/A"

                # Extract price (whole part)
                price_elems = product.find_elements(By.XPATH, ".//span[@class='a-price-whole']")
                price = price_elems[0].text.strip() if price_elems else "N/A"

                # Extract rating
                rating_elems = product.find_elements(By.XPATH, ".//span[@class='a-icon-alt']")
                rating = rating_elems[0].text.strip() if rating_elems else "N/A"

                writer.writerow([name, price, rating])
            except Exception as e:
                logging.error(f"Error parsing product element: {e}")

    driver.quit()
    logging.info(f"Dynamic scrape completed. Data saved to: {output_file}")

if __name__ == "__main__":
    # Example usage: searching for "laptop" on Amazon India
    amazon_url = "https://www.amazon.in/s?k=laptop"

    # 1. Static scraping example
    #    (may miss data if Amazon renders or lazy-loads elements with JS)
    scrape_amazon_static(amazon_url, "amazon_static.csv")

    # 2. Dynamic scraping example
    #    (handles JS-rendered elements and typically gets more complete data)
    scrape_amazon_dynamic(amazon_url, "amazon_dynamic.csv", headless=True)
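
    # Extension sketch (hypothetical queries, commented out): the same helpers
    # can be looped over several search terms, one CSV per term.
    # for query in ["laptop", "headphones"]:
    #     scrape_amazon_dynamic(f"https://www.amazon.in/s?k={query}",
    #                           f"amazon_{query}.csv", headless=True)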