-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcovid-19-vic-au-get-dep-ed.py
57 lines (46 loc) · 1.55 KB
/
covid-19-vic-au-get-dep-ed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import datetime
from io import StringIO
import os
import pandas
import requests
from bs4 import BeautifulSoup
def processWebPage(webpageURL, datadir, timeout=30):
    """
    Download a web page, flatten its text nodes and save them as a spreadsheet.

    The page text is rebuilt as one string: every text node whose parent is an
    <li> or <p> element is placed on its own line prefixed with 'li: ', while
    every other kept text node is appended with no separator.  That string is
    printed, parsed by pandas as delimited text with '||' as the column
    separator, printed again as a DataFrame and finally written to
    'Dep Ed Closures.xlsx' inside datadir.

    Parameters:
        webpageURL: address of the page to download.
        datadir: directory that receives the Excel file; a trailing path
            separator is optional.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: the download failed, timed out or the
            server answered with an HTTP error status.
    """
    # Text whose parent element is one of these is dropped.
    # There may be more elements you don't want, such as "style", etc.
    blacklist = {
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
    }
    # Text under these parents starts a new 'li: ' line in the output.
    own_line_parents = {'li', 'p'}

    # A timeout keeps the script from hanging on an unresponsive server, and
    # the status check stops an error page from being parsed as real content.
    response = requests.get(webpageURL, timeout=timeout)
    response.raise_for_status()

    # NOTE(review): the character replaced first is indistinguishable from a
    # plain space in the reviewed copy; presumably it is a non-breaking space
    # (U+00A0), so it is spelled out explicitly here - confirm.
    html_page = response.text.replace('\xa0', ' ').replace('\n', ' ')

    soup = BeautifulSoup(html_page, 'html.parser')

    pieces = []
    for node in soup.find_all(string=True):
        stripped = node.strip()
        if node.parent.name in blacklist or not stripped:
            continue
        if node.parent.name in own_line_parents:
            pieces.append('\nli: {}\n'.format(stripped))
        else:
            pieces.append(stripped)
    output_text = ''.join(pieces)
    print(output_text)

    # The separator is a regular expression matching a literal '||'.  A raw
    # string avoids invalid escape sequences, and naming the python engine
    # avoids the warning pandas emits when it has to fall back to it.
    output_DF = pandas.read_csv(
        StringIO(output_text), sep=r'\|\|', engine='python'
    )
    print(output_DF)

    # os.path.join works whether or not datadir ends with a separator.
    output_DF.to_excel(os.path.join(datadir, "Dep Ed Closures.xlsx"), index=False)
def main():
    """
    Main - program execute

    Prints a start timestamp, runs processWebPage for the closures page with
    the data directory below, prints a finish timestamp and then ends the
    interpreter by raising SystemExit.
    """
    webpageURL = 'https://www.education.vic.gov.au/about/programs/health/pages/closures.aspx'
    datadir = 'C:/Dev/covid-19-vic-au/'

    print('{} Starting ...'.format(datetime.datetime.now()))
    processWebPage(webpageURL, datadir)
    print('{} Finished!'.format(datetime.datetime.now()))

    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the site module for interactive sessions and is missing
    # under `python -S` or in frozen builds.  The exit status is unchanged.
    raise SystemExit
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()