-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.py
executable file
·63 lines (44 loc) · 1.61 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
import os.path
import re
import sys
import time
import requests
# Landing page of the records search.  It is requested first and the cookies
# it returns are sent with every later request.
SEARCH_ROOT_URL = 'http://www.tylerpolice.com/PIR/user/DoSearch.aspx'

# Results listing that is scanned for the first report link it contains.
MOST_RECENT_URL = 'http://www.tylerpolice.com/PIR/user/SearchResults.aspx?searchType=1'

# Template for a single report page; '%s' is replaced by the 9-digit report id.
MEDIA_REPORT_URL = 'http://www.tylerpolice.com/PIR/user/ViewReport.aspx?id=%s'

# Captures the 9-digit id out of a report link.  The script treats the first
# two digits as a year and the remaining seven as a running number.
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of relying on Python passing unknown string escapes through.
REPORT_URL_REGEX = re.compile(r'ViewReport\.aspx\?id=(\d{9})')
# Hit the search landing page first; its cookies are reused for every later
# request (presumably the site needs an established session - TODO confirm).
root = requests.get(SEARCH_ROOT_URL)
if root.status_code != 200:
    sys.exit('Root request failed.')

session_cookies = root.cookies

# Load the "most recent" listing to discover where the crawl should start.
recent = requests.get(MOST_RECENT_URL, cookies=session_cookies)
if recent.status_code != 200:
    sys.exit('Recent request failed.')

# Search the decoded body: the pattern is a text pattern, and matching it
# against raw bytes is a TypeError on Python 3.  Only the first report link
# on the page is used; it is assumed to be the newest one - TODO confirm.
recent_urls = REPORT_URL_REGEX.search(recent.text)
if recent_urls is None:
    # Without this guard a page with no report links dies with an opaque
    # AttributeError on .group().
    sys.exit('No report link found on the most recent page.')

# Split the 9-digit id into a 2-digit year prefix and a 7-digit number.
newest_id = recent_urls.group(1)
year = int(newest_id[:2])
report_id = int(newest_id[2:])
# Walk report numbers downward from the newest one to 0, caching each page
# under page_cache/.  Two kinds of files make the crawl resumable:
#   page_cache/<id>.html - the report page was fetched and saved
#   page_cache/<id>.skip - the site bounced back to the search page
# Ids that already have either file are passed over without a request.

# The cache directory must exist before anything can be written into it.
if not os.path.isdir('page_cache'):
    os.makedirs('page_cache')

max_attempts = 3  # consecutive failed requests tolerated for a single id
attempts = 0

while report_id >= 0:
    # Zero-pad the year too, so a year below 10 still yields a 9-digit id.
    next_id = '%02i%07i' % (year, report_id)
    html_path = 'page_cache/%s.html' % next_id
    skip_path = 'page_cache/%s.skip' % next_id

    if os.path.exists(html_path) or os.path.exists(skip_path):
        report_id -= 1
        continue

    # print(...) with one pre-formatted string behaves the same on
    # Python 2 and Python 3.
    print('Fetching report %s' % next_id)
    report_url = MEDIA_REPORT_URL % next_id
    report = requests.get(report_url, cookies=session_cookies)

    if report.status_code != 200:
        print('Request failed')
        attempts += 1
        if attempts < max_attempts:
            # Pause before retrying the same id; retrying immediately and
            # without limit would hammer the server in a tight loop.
            time.sleep(1)
            continue
        # Give up on this id for now.  No marker file is written, so a
        # later run will try it again.
        print('Giving up on report %s' % next_id)
    elif report.url == report_url:
        # The final URL is still the report URL: a real report was served.
        print('Saving report')
        # Binary mode: report.content is the raw response body.
        with open(html_path, 'wb') as f:
            f.write(report.content)
    elif report.url == SEARCH_ROOT_URL:
        # Redirected to the search page: remember that this id has no report.
        print('No report')
        with open(skip_path, 'w') as f:
            f.write('NO REPORT')
    # Any other final URL is neither saved nor marked, so the id is
    # requested again on the next run.

    attempts = 0
    report_id -= 1
    time.sleep(1)  # throttle: at most one report request per second