"""Crawl HXL to build a glossary of tagspecs and headers
CKAN API documentation: http://docs.ckan.org/en/latest/api/
Python CKAN library: https://github.com/ckan/ckanapi
Started by David Megginson, 2018-05-09
"""
import csv
import logging
import sys
import time

import ckanapi
import hxl
# Set up a logger
logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
logger = logging.getLogger(__name__)
DELAY = 2
"""Time delay in seconds between datasets, to give HDX a break."""

CHUNK_SIZE = 100
"""Number of datasets to read at once."""

CKAN_URL = 'https://data.humdata.org'
"""Base URL for the CKAN instance."""

USER_AGENT = 'HDXINTERNAL HXL hashtag analysis'
"""User agent (for analytics)."""
# Open a connection to HDX
ckan = ckanapi.RemoteCKAN(CKAN_URL, user_agent=USER_AGENT)
# Open a CSV output stream
output = csv.writer(sys.stdout)
# Iterate through all the datasets ("packages") and resources on HDX
start = 0
result_count = 999999 # just a big, big number; will reset on first search result
output.writerow([
    'Hashtag',
    'Hashtag with attributes',
    'Text header',
    'Locations',
    'Data provider',
    'HDX dataset id',
    'HDX resource id',
    'Date created',
    'Hash',
    'Quick Charts',
])
output.writerow([
    '#meta+tag',
    '#meta+tagspec',
    '#meta+header',
    '#country+code+list',
    '#org+provider',
    '#meta+dataset',
    '#meta+resource',
    '#date+created',
    '#meta+hash',
    '#meta+has_quickcharts',
])
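# The second header row is itself HXL-tagged, so the output CSV can be
# processed as an HXL dataset in its own right.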
while start < result_count:
    result = ckan.action.package_search(fq='vocab_Topics:hxl', start=start, rows=CHUNK_SIZE)
    result_count = result['count']
    for package in result['results']:
        package_id = package['name']
        org_id = package['organization']['name']
        location_ids = ' '.join([group['id'] for group in package['groups']])
        date_created = package['metadata_created'][:10]
        input_options = hxl.input.InputOptions(http_headers={'User-Agent': USER_AGENT})
        for resource in package['resources']:
            try:
                with hxl.data(resource['url'], input_options) as source:
                    # assumption: two datasets with exactly the same hashtags+attributes
                    # in exactly the same order are probably programmatic/API-based
                    # variants of the same source data
                    column_hash = hash(tuple(column.display_tag for column in source.columns))
                    for column in source.columns:
                        if column.tag:
                            output.writerow([
                                column.tag,
                                column.get_display_tag(sort_attributes=True),
                                column.header,
                                location_ids,
                                org_id,
                                package_id,
                                resource.get('id'),
                                date_created,
                                hex(abs(column_hash)),
                                # .get() guards against packages missing the key
                                'true' if package.get('has_quickcharts') else 'false',
                            ])
            except Exception as e:
                logger.warning("Failed to parse resource %s in dataset %s as HXL (%s): %s",
                               resource['id'], package['name'], str(e), resource['url'])
        time.sleep(DELAY)  # give HDX a short rest between datasets
    start += CHUNK_SIZE  # next chunk
# end
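# Example usage (assumes the ckanapi and libhxl packages are installed;
# the output filename below is illustrative):
#
#     python crawl-hdx.py > hxl-hashtag-glossary.csv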