-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
41 lines (31 loc) · 2.13 KB
/
scraper.py
File metadata and controls
41 lines (31 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Collab with Triambaka Naresh
# Scrape data from multiple HTML pages along with every header.
# (Scraped data is stored in Weather.csv)
from bs4 import BeautifulSoup
import urllib.request
import csv

baseurl = 'https://karki23.github.io/Weather-Data/'
validpages = ['Albury.html','BadgerysCreek.html','Cobar.html','CoffsHarbour.html','Moree.html','Newcastle.html','NorahHead.html','NorfolkIsland.html','Penrith.html','Richmond.html','Sydney.html','SydneyAirport.html','WaggaWagga.html','Williamtown.html','Wollongong.html','Canberra.html','Tuggeranong.html','MountGinini.html','Ballarat.html','Bendigo.html','Sale.html','MelbourneAirport.html','Melbourne.html','Mildura.html','Nhil.html','Portland.html','Watsonia.html','Dartmoor.html','Brisbane.html','Cairns.html','GoldCoast.html','Townsville.html','Adelaide.html','MountGambier.html','Nuriootpa.html','Woomera.html','Albany.html','Witchcliffe.html','PearceRAAF.html','PerthAirport.html','Perth.html','SalmonGums.html','Walpole.html','Hobart.html','Launceston.html','AliceSprings.html','Darwin.html','Katherine.html','Uluru.html']

# BUGFIX: the original opened Weather.csv without ever closing it, so buffered
# rows might not be flushed before the shuffling step below re-reads the file.
# A `with` block guarantees the file is flushed and closed when scraping ends.
with open('Weather.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for page in validpages:
        url = f'{baseurl}{page}'
        # Fetch each city's page and parse it with the lxml backend.
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
        # Every row (<tr>) of the first table, including the header row.
        rows = soup('table')[0].find_all('tr')
        for row in rows:
            # Direct children of the row are its cells; strip whitespace
            # from each cell's text before writing the CSV record.
            cells = [cell.text.strip() for cell in row.findChildren(recursive=False)]
            writer.writerow(cells)
# Shuffle the dataset so the sample mixes rows from many cities
# (shuffled result is stored in shuffled.csv).
from random import shuffle

with open('Weather.csv') as src:
    all_rows = src.readlines()

# Keep the header row fixed in place; randomize only the data rows.
header_row, data_rows = all_rows[0], all_rows[1:]
shuffle(data_rows)

with open('shuffled.csv', 'w') as dst:
    dst.write(header_row)
    dst.writelines(data_rows)
# Removing duplicates: the header was written once per scraped page (49 times)
# but must appear only once (deduplicated data is stored back in Weather.csv).
# IMPROVEMENT: a plain seen-set replaces more_itertools.unique_everseen with
# identical semantics (keep the first occurrence of each line, preserve order)
# while dropping the third-party dependency.
with open('shuffled.csv', 'r') as src, open('Weather.csv', 'w') as out_file:
    seen = set()
    for line in src:
        if line not in seen:
            seen.add(line)
            out_file.write(line)