-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
41 lines (31 loc) · 2.13 KB
/
scraper.py
File metadata and controls
41 lines (31 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Collab with Triambaka Naresh
# Scrape data from multiple HTML pages along with every header.
# (Scraped data is stored in Weather.csv)
from bs4 import BeautifulSoup
import urllib.request
import csv

baseurl = 'https://karki23.github.io/Weather-Data/'
validpages = ['Albury.html','BadgerysCreek.html','Cobar.html','CoffsHarbour.html','Moree.html','Newcastle.html','NorahHead.html','NorfolkIsland.html','Penrith.html','Richmond.html','Sydney.html','SydneyAirport.html','WaggaWagga.html','Williamtown.html','Wollongong.html','Canberra.html','Tuggeranong.html','MountGinini.html','Ballarat.html','Bendigo.html','Sale.html','MelbourneAirport.html','Melbourne.html','Mildura.html','Nhil.html','Portland.html','Watsonia.html','Dartmoor.html','Brisbane.html','Cairns.html','GoldCoast.html','Townsville.html','Adelaide.html','MountGambier.html','Nuriootpa.html','Woomera.html','Albany.html','Witchcliffe.html','PearceRAAF.html','PerthAirport.html','Perth.html','SalmonGums.html','Walpole.html','Hobart.html','Launceston.html','AliceSprings.html','Darwin.html','Katherine.html','Uluru.html']

# BUGFIX: the original opened Weather.csv without ever closing it, so buffered
# rows might not be flushed before the shuffling step below re-reads the file.
# A `with` block guarantees the file is flushed and closed when scraping ends.
with open('Weather.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for page in validpages:
        url = f'{baseurl}{page}'
        # Fetch each city's page and parse it with the lxml backend.
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
        # Every row (<tr>) of the first table, including the header row.
        rows = soup('table')[0].find_all('tr')
        for row in rows:
            # Direct children of the row are its cells; strip whitespace
            # from each cell's text before writing the CSV record.
            cells = [cell.text.strip() for cell in row.findChildren(recursive=False)]
            writer.writerow(cells)
# Shuffle the dataset so the sample mixes rows from many cities
# (shuffled result is stored in shuffled.csv).
from random import shuffle

with open('Weather.csv') as src:
    all_rows = src.readlines()

# Keep the header row fixed in place; randomize only the data rows.
header_row, data_rows = all_rows[0], all_rows[1:]
shuffle(data_rows)

with open('shuffled.csv', 'w') as dst:
    dst.write(header_row)
    dst.writelines(data_rows)
# Removing duplicates: the header was written once per scraped page (49 times)
# but must appear only once (deduplicated data is stored back in Weather.csv).
# IMPROVEMENT: a plain seen-set replaces more_itertools.unique_everseen with
# identical semantics (keep the first occurrence of each line, preserve order)
# while dropping the third-party dependency.
with open('shuffled.csv', 'r') as src, open('Weather.csv', 'w') as out_file:
    seen = set()
    for line in src:
        if line not in seen:
            seen.add(line)
            out_file.write(line)