fetch_govuk_data.py
import csv
import sys

import requests_cache
from requests_cache import CachedSession
from requests.adapters import HTTPAdapter

# Cache all GOV.UK requests in a local SQLite database so that re-runs
# do not re-fetch pages that have already been downloaded.
session = CachedSession(cache_name='govuk_cache', backend='sqlite')
session.mount('http://', HTTPAdapter())
session.mount('https://', HTTPAdapter())
def test_base_path(original_url):
    """
    Perform a HEAD request for the given URL, following any redirects, and
    return the final URL. Returns None if the page is gone or responds with
    an unexpected status code.
    """
    response = session.head(original_url, allow_redirects=True)
    if 200 <= response.status_code < 300:
        return response.url
    elif response.status_code == 429:
        # Rate limited: raise so the caller can see the failure and back off.
        response.raise_for_status()
    else:
        if response.status_code not in (410,):
            sys.stderr.write("Unexpected response {} for {}\n".format(
                response.status_code, original_url))
        return None
def fetch_education_urls(input_file):
    """
    Given a local CSV file, parse it and return the list of GOV.UK URLs held
    in its first column.
    """
    with open(input_file, 'r') as f:
        reader = csv.reader(f)
        # Skip the header row
        next(reader, None)
        documents = list(reader)
    return [document[0] for document in documents]
def fetch_metadata(url):
    """
    Given a GOV.UK URL, fetch the page from the content store and extract
    where it is published/rendered, its various data formats, and its first
    taxon.
    """
    govuk_url = test_base_path(url)
    content_store_url = govuk_url.replace(
        "https://www.gov.uk/",
        "https://www.gov.uk/api/content/"
    )
    r = session.get(content_store_url, verify=False, timeout=10)
    json_data = r.json()
    links = json_data.get('links') or {}
    taxons = links.get('taxons')
    taxon_title = ''
    taxon_base_path = ''
    taxon_description = ''
    if taxons:
        # Only the first taxon is recorded.
        taxon = taxons[0]
        taxon_title = taxon.get('title')
        taxon_base_path = taxon.get('base_path')
        taxon_description = taxon.get('description')
    return [
        json_data.get('content_id'),
        json_data.get('rendering_app') or '',
        json_data.get('publishing_app'),
        json_data.get('document_type'),
        json_data.get('format'),
        json_data.get('schema_name'),
        taxon_title,
        taxon_base_path,
        taxon_description
    ]
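# Illustrative sketch only (the path below is hypothetical, not taken from the
# source data): a page at https://www.gov.uk/some-education-page would be
# looked up at https://www.gov.uk/api/content/some-education-page, and the
# JSON returned there supplies the fields collected above, in the same order
# as the CSV header written in the main block.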
if __name__ == "__main__":
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    requests_cache.install_cache()
    with open(output_file, 'w', newline='') as csvfile:
        content_writer = csv.writer(csvfile, delimiter=',')
        content_writer.writerow(
            [
                'URL',
                'content_id',
                'rendering_app',
                'publishing_app',
                'document_type',
                'format',
                'schema_name',
                'taxon_title',
                'taxon_base_path',
                'taxon_description'
            ]
        )
        urls = fetch_education_urls(input_file)
        for idx, url in enumerate(urls):
            try:
                print("===> Processing URL #{} - {}".format(idx, url))
                data = fetch_metadata(url)
                content_writer.writerow([url] + data)
            except Exception as e:
                # Record the URL with an empty row so failures are visible in
                # the output without aborting the whole run.
                sys.stderr.write("Failed to fetch {}: {}\n".format(url, e))
                content_writer.writerow([url, ''])
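# Example invocation (file names are hypothetical; the input CSV is expected
# to have a header row and GOV.UK URLs in its first column):
#
#   python fetch_govuk_data.py education_urls.csv education_metadata.csv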