-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
159 lines (121 loc) · 5.76 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
from urllib.request import urlopen
import mysql.connector
mydb = mysql.connector.connect(host="localhost",user="USERNAME",passwd="PASSWORD",database="DATABASE")
mycursor = mydb.cursor(buffered=True)
def get_product(page):
# specify the url
quote_page = page
# query the website and return the html to the variable page
page = urlopen(quote_page)
# parse the html using beautiful soup and store in variable soup
soup = BeautifulSoup(page, "html.parser")
# Take out the <div> of name and get its value
#url = soup.find("link", itemprop="url")
site_name = soup.find("meta", property="og:site_name")
url = soup.find("meta", property="og:url")
title = soup.find("meta", property="og:title")
currency = soup.find("meta", property="og:price:currency")
image = soup.find("meta", property="og:image")
image_secure = image = soup.find("meta", property="og:image:secure_url")
gtin = soup.find("meta", itemprop="gtin13")
brand = soup.find("meta", itemprop="name")
description = soup.find("meta", itemprop="description")
category = soup.find("meta", itemprop="category")
category = category["content"]
category = category.strip() # strip() is used to remove starting and trailing
#url = soup.find("meta", itemprop="mainEntityOfPage url")
#availability = soup.find("meta", itemprop="availability")
#availability = (availability["content"]).replace("https://schema.org/", "")
for available in soup.find_all('div', attrs={'id':'availability'}):
#print availability .find('span')
availability = available .find('title')
availability = availability.text
availability = availability.strip() # strip() is used to remove starting and trailing
#print availability
priceCurrency = soup.find("meta", itemprop="priceCurrency")
sku = soup.find("meta", itemprop="sku")
price = soup.find("meta", itemprop="price")
#prices = price.text.strip() # strip() is used to remove starting and trailing
price2 = soup.find("meta", property="og:price:amount")
print(url["content"] if url else "No meta title given")
print(title["content"] if url else "No meta title given")
print(availability if availability else "No meta title given")
print(priceCurrency["content"] if priceCurrency else "No meta title given")
#print(price["content"] if price else "No meta title given")
print(price2["content"] if price else "No meta title given")
print(sku["content"] if price else "No meta title given")
print(image["content"] if price else "No meta title given")
print(brand["content"] if price else "No meta title given")
print(description["content"] if price else "No meta title given")
print(category if category else "No meta title given")
# NEW SECTION
vUrl = (url["content"] if url else "No meta title given")
vTitle = (title["content"] if url else "No meta title given")
print(vUrl)
print(vTitle)
vAvailability = (availability if availability else "No meta title given")
print(vAvailability)
vPriceCurrency = (priceCurrency["content"] if priceCurrency else "No meta title given")
vPrice = (price2["content"] if price else "No meta title given")
vSKU = (sku["content"] if price else "No meta title given")
vImage = (image["content"] if price else "No meta title given")
vBrand = (brand["content"] if price else "No meta title given")
vDescription = (description["content"] if price else "No meta title given")
vCategory = (category if category else "No meta title given")
sql = "INSERT INTO stage_jbhifi(SKU, Title, Brand, Availability, PriceCurrency, Price, URL, ImageURL, Description, Category) VALUES (%s, %s, %s,%s, %s, %s, %s, %s, %s,%s ) ON DUPLICATE KEY UPDATE Title=%s,Brand=%s,Availability=%s,PriceCurrency=%s,Price=%s,URL= %s,ImageURL=%s,Description = %s,Category = %s"
#sql = "INSERT INTO stage_jbhifi(SKU, Title, Brand, Availability, PriceCurrency, Price, URL, ImageURL, Description, Category) VALUES (1,1,1,1,1,1,1,1,1,1)"
val = (vSKU, vTitle, vBrand, vAvailability,vPriceCurrency,vPrice,vUrl,vImage,vDescription,vCategory, vTitle, vBrand, vAvailability,vPriceCurrency,vPrice,vUrl,vImage,vDescription,vCategory)
print("Status Updated")
mycursor.execute(sql,val)
mydb.commit()
# get the index price
price_box = soup.find("div", attrs={"class":"price"})
price = price_box
def get_sitemap(url):
get_url = requests.get(url)
if get_url.status_code == 200:
return get_url.text
else:
print('Unable to fetch sitemap: %s.' % url)
def process_sitemap(s):
soup = BeautifulSoup(s, 'lxml')
result = []
for loc in soup.findAll('loc'):
result.append(loc.text)
return result
def is_sub_sitemap(url):
parts = urlparse(url)
if parts.path.endswith('.xml') and 'sitemap' in parts.path:
return True
else:
return False
def parse_sitemap(s):
sitemap = process_sitemap(s)
result = []
while sitemap:
candidate = sitemap.pop()
if is_sub_sitemap(candidate):
sub_sitemap = get_sitemap(candidate)
for i in process_sitemap(sub_sitemap):
sitemap.append(i)
else:
result.append(candidate)
return result
def main():
sitemap = get_sitemap('https://www.jbhifi.com.au/sitemap.xml')
url_count = 0
for url in parse_sitemap(sitemap):
#url_count += 1
if "https://www.jbhifi.com.au/products/" in url:
url_count += 1
print("%5d) %s" % (url_count, url))
try:
get_product(url)
except Exception:
pass
print("-end-of-list-")
if __name__ == '__main__':
main()