harvester.py
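""" harvester.py - collects events and projects from a list of Dribdat
    servers into CSV files, using a Frictionless Data Package to define
    the output columns """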
import csv
import json

import requests
from requests.exceptions import JSONDecodeError, RequestException

INPUT_URLS = 'urls.txt'                # one Dribdat hostname per line
OUTPUT_PROJECTS = 'data/projects.csv'
OUTPUT_EVENTS = 'data/events.csv'
MAX_PROJECTS = 100000                  # stop collecting from a server past this total

def main():
    """ Harvests all servers listed in INPUT_URLS and saves the results """
    with open(INPUT_URLS, 'r') as f:
        urls = [u.strip() for u in f.read().splitlines() if u.strip()]
    print("Getting ready to harvest %d sites" % len(urls))
    eventcols, projectcols = get_datapackage_schema()
    print("Data Package loaded, Table Schema ready!")
    events = []
    projects = []
    for u in urls:
        print("--- Harvesting: %s" % u)
        ee, pp = get_events_projects(u)
        if ee is not None and pp is not None:
            events.extend(ee)
            projects.extend(pp)
    save_data(eventcols, events, projectcols, projects)
    print("Done.")

def get_datapackage_schema():
    """ Loads a Data Package containing the target Table Schema """
    projcols = eventcols = None
    with open('datapackage.json') as f:
        for res in json.load(f)['resources']:
            if 'events' in res['name']:
                eventcols = [field['name'] for field in res['schema']['fields']]
            if 'projects' in res['name']:
                projcols = [field['name'] for field in res['schema']['fields']]
    return eventcols, projcols
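
# For reference, the loader above expects a Frictionless Data Package along
# the lines of the sketch below (resource layout taken from the code; the
# field names are illustrative, not the actual schema in this repository):
#
#   {"resources": [
#       {"name": "events",   "schema": {"fields": [{"name": "id"}, ...]}},
#       {"name": "projects", "schema": {"fields": [{"name": "id"}, ...]}}
#   ]}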

def get_events_projects(urlhost):
    """ Fetches all the events and projects from a Dribdat server """
    url = 'https://' + urlhost
    try:
        events_data = requests.get(url + '/api/events.json', timeout=5).json()
    except (JSONDecodeError, RequestException):
        print("!! invalid response or connection error, skipping this server")
        return None, None
    all_events = events_data['events']
    for event in all_events:
        event['origin'] = urlhost
    print('Collecting data from %d events' % len(all_events))
    count_total = 0
    project_data = {}
    for event in all_events:
        url_api = url + "/api/event/%d/projects.json" % event['id']
        print('.. event %d (%s)' % (event['id'], event['name']))
        try:
            proj_data = requests.get(url_api, timeout=5).json()
        except (JSONDecodeError, RequestException):
            print("!! invalid response or connection error, skipping this server")
            return None, None
        if 'projects' in proj_data:
            project_data[event['id']] = proj_data['projects']
            count_total += len(proj_data['projects'])
        else:
            project_data[event['id']] = []
            print('!! no data for event %d' % event['id'])
        if count_total > MAX_PROJECTS:
            break
    # Flatten the per-event lists and tag each project with its origin server
    all_projects = []
    for projs in project_data.values():
        for proj in projs:
            proj['origin'] = urlhost
            all_projects.append(proj)
    print('Downloaded a total of %d projects' % len(all_projects))
    return all_events, all_projects
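
# The function above assumes Dribdat API responses shaped roughly like this
# (inferred from the keys the code reads, not from the API documentation):
#   GET /api/events.json              -> {"events": [{"id": 1, "name": ...}, ...]}
#   GET /api/event/<id>/projects.json -> {"projects": [{"id": 7, "name": ...}, ...]}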

def save_data(eventcols, events, projectcols, projects):
    """ Saves the events and projects to CSV files """
    # newline='' is required by the csv module to avoid blank rows on Windows;
    # extrasaction='ignore' drops any fields not defined in the Table Schema
    with open(OUTPUT_EVENTS, "w", newline='') as f:
        cw = csv.DictWriter(f, eventcols, extrasaction='ignore')
        cw.writeheader()
        cw.writerows(events)
    print("Wrote %s" % OUTPUT_EVENTS)
    with open(OUTPUT_PROJECTS, "w", newline='') as f:
        cw = csv.DictWriter(f, projectcols, extrasaction='ignore')
        cw.writeheader()
        cw.writerows(projects)
    print("Wrote %s" % OUTPUT_PROJECTS)

if __name__ == "__main__":
main()
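
# Usage sketch: list one Dribdat hostname per line in urls.txt, for example
# a hypothetical server "dribdat.example.org", make sure the data/ directory
# exists (the CSV writers do not create it), then run:  python harvester.py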