-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathdcat.py
157 lines (124 loc) · 5.77 KB
/
dcat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import copy
from dateutil import parser
from urllib.parse import urlparse
from urllib.parse import parse_qs
from bs4 import BeautifulSoup
try:
    from processor import Processor
except ImportError:
    # Only a failed import should trigger the fallback; any other error raised
    # while loading the module must surface instead of being masked.
    # NOTE(review): the relative form presumably covers the case where this
    # file is imported as part of a package rather than run as a script.
    from .processor import Processor
class ProcessorDCAT(Processor):
    """Processor for DCAT catalogues served as JSON-LD.

    Every distribution of every dataset in the catalogue becomes one CSV row.
    """

    def __init__(self):
        super().__init__(type="dcat")

    @staticmethod
    def _parse_date(value):
        """Return *value* parsed as a date, or None if missing or unparseable."""
        try:
            return parser.parse(value).date()
        except (ValueError, TypeError, OverflowError):
            return None

    def get_datasets(self, owner, start_url, fname):
        """Fetch the catalogue at *start_url* and write its resources to *fname*.

        Args:
            owner: Accepted for interface compatibility. The owner written to
                each row is the dataset's own ``dct:publisher`` name.
            start_url: URL of the DCAT JSON document.
            fname: Destination passed through to ``write_csv``.

        Nothing is written when the catalogue could not be fetched.
        """
        print(start_url)
        datasets_collection = self.get_json(start_url)
        # NOTE(review): "NULL" appears to be how get_json signals a failed
        # fetch - confirm against the Processor base class.
        if datasets_collection == "NULL":
            return
        datasets_collection = datasets_collection["dcat:dataset"]
        print(f"Found {len(datasets_collection)} datasets")

        prepped = []
        for dataset in datasets_collection:
            # A single keyword may arrive as a bare string instead of a list.
            keywords = dataset.get("dcat:keyword", [])
            if isinstance(keywords, str):
                keywords = [keywords]

            # Common metadata shared by every resource of this dataset.
            title = dataset.get("dct:title", "")
            publisher = dataset.get("dct:publisher") or {}
            publisher_name = (
                publisher.get("foaf:name", "") if isinstance(publisher, dict) else ""
            )
            original_dataset_link = dataset.get("@id", "")
            date_created = self._parse_date(dataset.get("dct:issued", ""))
            date_modified = self._parse_date(dataset.get("dct:modified", ""))
            tags = ";".join(map(str, keywords))
            metadata_url = dataset.get("dct:identifier", "")
            dataset_license = get_license(metadata_url)  # TODO
            description = (dataset.get("dct:description") or "").strip("\u200b")

            # Like keywords, a lone distribution may not be wrapped in a list,
            # and the property may be absent altogether.
            dataset_resources = dataset.get("dcat:distribution") or []
            if isinstance(dataset_resources, dict):
                dataset_resources = [dataset_resources]

            for resource in dataset_resources:
                resource_link = resource.get("dcat:accessURL", {}).get("@id", "")
                # NOTE(review): the file name comes from dct:description and
                # the file type from dct:title - kept as-is; confirm this
                # matches the feeds being processed.
                file_name = resource.get("dct:description", "")
                file_type = resource.get("dct:title", "")
                prepped.append(
                    [
                        title,
                        publisher_name,
                        original_dataset_link,
                        resource_link,
                        file_name,
                        date_created,
                        date_modified,
                        "",  # TODO: size
                        "",  # TODO: size unit
                        file_type,
                        "",  # TODO: numrecords,
                        tags,
                        "",  # Manual tags
                        dataset_license,
                        description,
                    ]
                )

        print(f"{len(prepped)} lines for csv")
        self.write_csv(fname, prepped)
def get_license(metadata_url):
    """Look up the licence of the ArcGIS item referenced by *metadata_url*.

    The item GUID is taken from the ``id`` query parameter of *metadata_url*.
    The item's JSON is then fetched from the ArcGIS sharing REST API and its
    ``licenseInfo`` field is handed to ``parse_license``.

    Returns:
        The value produced by ``parse_license``, or "" when *metadata_url*
        carries no ``id`` parameter or the item metadata is not a JSON object.
    """
    query_params = parse_qs(urlparse(metadata_url or "").query)
    guid_values = query_params.get("id")
    if not guid_values:
        # No GUID means there is nothing to look up; an empty licence is
        # better than aborting the whole run with a KeyError.
        return ""
    dataset_guid = guid_values[0]
    license_metadata_url = (
        f"https://www.arcgis.com/sharing/rest/content/items/{dataset_guid}?f=json"
    )
    license_metadata = processor.get_json(license_metadata_url)
    # NOTE(review): get_json appears to return the string "NULL" on failure
    # (see the check in get_datasets), which has no .get().
    if not isinstance(license_metadata, dict):
        return ""
    return parse_license(license_metadata.get("licenseInfo", ""))
# Layout whitespace (including the non-breaking space) mapped to a plain space
# in a single pass.
_LICENSE_WHITESPACE = str.maketrans({"\t": " ", "\r": " ", "\n": " ", "\xa0": " "})

# Lower-case fragments whose presence anywhere in the licence text marks it as
# OGL v3. The URL is listed without a scheme so both http and https links match.
_OGL3_FRAGMENTS = (
    "data is being released under open government licence terms",
    "supplied under the open government licence",
    "supplied under the open government license",
    "this dataset is available for use under the open government licence",
    "this dataset is available under the terms of the uk open government licence",
    "made available under the open government licence",
    "you do have to adhere to the terms of the open government licence",
    "publish our mapping datasets under the open government licence",
    "we use the open government licence",
    "licensed under the open government licence",
    "covered by open government licence",
    "nationalarchives.gov.uk/doc/open-government-licence/version/3",
)

# Lower-case openings that mark the licence text as OGL v3.
_OGL3_PREFIXES = (
    "open government licence",
    "open government license",
    "uk open government licence",
    "uk open government license",
)


# TODO: This probably needs to be refactored as part of a main license parsing function
def parse_license(license_info):
    """Map free-text (possibly HTML) licence information to a licence identifier.

    Args:
        license_info: Raw licence text or HTML, or None.

    Returns:
        "OGL3" for recognised Open Government Licence wording, the Creative
        Commons BY-SA 3.0 URL for the exact text "CC-BY-SA", and "" for empty
        or unrecognised input (unrecognised text is also printed).
    """
    if license_info in (None, ""):
        return ""

    # Strip HTML. The parser is named explicitly so the result does not depend
    # on which optional parsers happen to be installed.
    soup = BeautifulSoup(license_info, "html.parser")
    license_text = soup.get_text().translate(_LICENSE_WHITESPACE).strip()
    if not license_text:
        return ""

    lowered = license_text.lower()
    if lowered.startswith(_OGL3_PREFIXES) or any(
        fragment in lowered for fragment in _OGL3_FRAGMENTS
    ):
        return "OGL3"

    if license_text == "CC-BY-SA":
        return "https://creativecommons.org/licenses/by-sa/3.0/"

    # TODO: Log unknown licenses as warnings
    print(f"UNKNOWN LICENSE: {license_text}")
    return ""
# Module-level instance: get_license() and the class methods reach the shared
# JSON/CSV helpers through this name, so it must exist at import time.
processor = ProcessorDCAT()

# Run the processor only when this file is executed as a script.
if __name__ == "__main__":
    processor.process()