forked from planningalerts-scrapers/yarra
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.rb
68 lines (58 loc) · 2.27 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
require 'scraperwiki'
require 'mechanize'
# Search results page listing every *current* planning application across
# all suburbs, streets and wards of the City of Yarra.
url = "http://www.yarracity.vic.gov.au/Planning-Application-Search/Results.aspx?ApplicationNumber=&Suburb=(All)&Street=(All)&Status=Current&Ward=(All)"
# Normalise whitespace in +a+: carriage returns and newlines become
# spaces, runs of spaces collapse to one, and leading/trailing
# whitespace is removed. Returns the cleaned string.
def clean_whitespace(a)
  a.tr("\r\n", ' ').squeeze(' ').strip
end
# Scrapes every planning-application row out of a results +page+
# (a Mechanize page) and saves new records to the ScraperWiki sqlite
# store, skipping any council_reference already saved.
# The first row (column headers) and last row (the pager) of the
# results table are ignored via trs[1..-2].
def get_page_data(page)
  comment_url = "http://www.yarracity.vic.gov.au/planning--building/Planning-applications/Objecting-to-a-planning-applicationVCAT/"
  trs = page.search('table#ContentPlaceHolder_dgResults/tr')
  trs[1..-2].each do |tr|
    # Columns: reference, date received, address, description, notice date.
    texts = tr.search('td').map{|n| n.inner_text}
    council_reference = clean_whitespace(texts[0])
    info_url = "http://www.yarracity.vic.gov.au/Planning-Application-Search/Results.aspx?ApplicationNumber=#{council_reference}&Suburb=(All)&Street=(All)&Status=(All)&Ward=(All)"
    record = {
      'info_url' => info_url,
      'comment_url' => comment_url,
      'council_reference' => council_reference,
      'date_received' => Date.parse(texts[1]).to_s,
      'address' => clean_whitespace(texts[2]),
      'description' => clean_whitespace(texts[3]),
      'date_scraped' => Date.today.to_s
    }
    begin
      record["on_notice_from"] = Date.parse(texts[4]).to_s
    rescue ArgumentError, TypeError
      # Notice-date column may be blank or malformed — just omit the field.
      # (Date.parse raises ArgumentError on bad input, TypeError on nil.)
    end
    # Parameterised query: council_reference comes from scraped HTML, so it
    # must never be interpolated straight into SQL — a stray quote would
    # break (or inject into) the statement.
    if ScraperWiki.select("* from data where `council_reference`=?", [record['council_reference']]).empty?
      ScraperWiki.save_sqlite(['council_reference'], record)
    else
      puts "Skipping already saved record " + record['council_reference']
    end
  end
end
# Walk every page of search results: scrape the current page, then fake
# the ASP.NET __doPostBack postback behind the pager links to advance.
agent = Mechanize.new
page = agent.get(url)
current_page = 1
loop do
  get_page_data(page)
  # The pager lives in the last table row: numbered page links plus "..."
  # links that jump to the next/previous group of pages.
  pager_links = page.search('table tr')[-1].search('a')
  link = pager_links.find { |a| a.inner_text.to_i == current_page + 1 }
  if link.nil?
    # No direct "next page" number visible; fall back to a forward "..."
    # link. Skip the first link in case it's a "..." that goes backwards.
    link = pager_links[1..-1].find { |a| a.inner_text == "..." }
  end
  break unless link

  # The href is javascript:__doPostBack('target','argument'); emulate it
  # by filling the hidden ASP.NET fields and submitting the form ourselves.
  matches = link["href"].match(/javascript:__doPostBack\('(.*)','(.*)'\)/)
  form = page.forms.first
  form["__EVENTTARGET"] = matches[1]
  form["__EVENTARGUMENT"] = matches[2]
  page = form.submit
  current_page += 1
end