-
Notifications
You must be signed in to change notification settings - Fork 33
/
render_datasets.py
executable file
·123 lines (96 loc) · 3.62 KB
/
render_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
"""Render dataset list to a given format.
Example usage
-------------
$ ./render_datasets.py mir-datasets.yaml datasets.md
$ ./render_datasets.py mir-datasets.yaml datasets.js
"""
import argparse
import markdown
import joblib
import json
import os
import requests
import sys
import yaml
# Header of the rendered Markdown table: a leading blank line, the column
# titles, and the separator row. Rendered records are appended after it.
MARKDOWN_TEMPLATE = (
    '\n'
    'status| dataset | metadata | contents | with audio\n'
    '--- | --- | --- | --- | ---\n'
)

# One table row per dataset; the placeholders are filled by str.format().
MARKDOWN_RECORD = ' | '.join([
    '{status}',
    '<a title="{key}" href="{url}" target="_blank" rel="noopener">{title}</a>',
    '{metadata}',
    '{contents}',
    '{audio}',
])

# Status glyph shown for a dataset URL: 0 -> reachable, 1 -> broken.
HEALTH = {0: '✅', 1: '☠'}
def get_url_status(url):
    """Probe a URL and return the matching health glyph.

    Parameters
    ----------
    url : str
        Address to request with an HTTP GET (3 second timeout).

    Returns
    -------
    status : str
        ``HEALTH[0]`` when the server answers with a status code below 400,
        ``HEALTH[1]`` when the code is 400 or above or the request fails.
    """
    try:
        # stream=True: only the status line and headers are needed, so the
        # body is never downloaded.
        response = requests.get(url, timeout=3, stream=True)
    except requests.exceptions.RequestException:
        # Any request-level failure (connection error, timeout, malformed
        # URL, redirect loop, ...) counts as a dead link instead of
        # aborting the whole render.
        return HEALTH[1]

    try:
        status_code = response.status_code
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()

    return HEALTH[1 if status_code >= 400 else 0]
def render_one(key, record):
    """Render a single dataset record as one Markdown table row.

    Parameters
    ----------
    key : str
        Identifier of the dataset; also used as the title when the record
        has no 'title' entry.
    record : dict
        Dataset description. Must contain 'url'; the remaining entries are
        passed to ``MARKDOWN_RECORD.format`` as keyword arguments, so every
        other placeholder of that template must be present as well.
        'title' and 'metadata' are optional. The record is not modified.

    Returns
    -------
    row : str
        ``MARKDOWN_RECORD`` filled in for this dataset.
    """
    # Work on a shallow copy: popping from the caller's dict would strip
    # 'title' and 'metadata' from the dataset and make a second render of
    # the same records produce different output.
    fields = dict(record)

    title = fields.pop('title', key)
    status = get_url_status(fields['url'])
    metadata = fields.pop('metadata', '')

    if isinstance(metadata, list):
        parts = []
        for item in metadata:
            if isinstance(item, dict):
                # A mapping is rendered as a Markdown link built from its
                # first key and that key's value.
                label = list(item)[0]
                item = '[{}]({})'.format(label, item[label])
            parts.append(item)
        metadata = ', '.join(parts)

    return MARKDOWN_RECORD.format(key=key, title=title, metadata=metadata,
                                  status=status, **fields)
def render(records, output_format, n_jobs=-1, verbose=0):
    '''Render a number of records to the given output format.

    Parameters
    ----------
    records : dict
        Dataset records, keyed by dataset identifier.

    output_format : str
        One of ['md', 'js']

    n_jobs : int
        Passed to ``joblib.Parallel`` as ``n_jobs``.

    verbose : int
        Passed to ``joblib.Parallel`` as ``verbose``.

    Returns
    -------
    data : str
        String data to write to file. For 'md' this is the Markdown table;
        for 'js' it is a ``document.write(...)`` statement wrapping the
        table converted to HTML.

    Raises
    ------
    ValueError
        If ``output_format`` is not supported.
    '''
    # Reject a bad format up front, before any record is rendered: each
    # record triggers a URL check, which is wasted work if the result
    # cannot be emitted anyway.
    if output_format not in ('md', 'js'):
        raise ValueError("unsupported output format: {}".format(output_format))

    # Rows are ordered case-insensitively by dataset key.
    ordered = sorted(records.items(), key=lambda pair: pair[0].lower())

    # Fan out
    pool = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)
    dfx = joblib.delayed(render_one)
    lines = pool(dfx(key, record) for key, record in ordered)

    md = MARKDOWN_TEMPLATE + '\n'.join(lines)
    if output_format == 'md':
        return md

    # 'js': convert the table to HTML and emit it as a single-line,
    # JSON-quoted argument to document.write().
    html = markdown.markdown(md, extensions=['extra', 'smarty'],
                             output_format='html5')
    return "document.write({})".format(json.dumps(html.replace('\n', '')))
if __name__ == '__main__':
    # Command-line entry point: load the dataset file, render it in the
    # format implied by the output file's extension, and write the result.
    parser = argparse.ArgumentParser(description=__doc__)

    # Inputs
    parser.add_argument("dataset_file",
                        metavar="dataset_file", type=str,
                        help="Path to the dataset file.")
    parser.add_argument("output_file",
                        metavar="output_file", type=str,
                        help="Path to rendered output.")
    parser.add_argument("--n_jobs",
                        metavar="n_jobs", type=int, default=1,
                        help="Number of CPUs to use for parallelization (-1 for all).")
    parser.add_argument("--verbose",
                        metavar="verbose", type=int, default=0,
                        help="Verbosity level, 0 for nothing, >1 for something.")

    args = parser.parse_args()

    # The output format is the extension of the output path, without dots.
    output_format = os.path.splitext(args.output_file)[-1].strip('.')

    # NOTE(review): yaml.load with FullLoader is kept as-is; if the dataset
    # file can come from an untrusted source, switch to yaml.safe_load.
    with open(args.dataset_file, encoding='utf-8') as fp:
        dataset = yaml.load(fp, Loader=yaml.FullLoader)

    # Render before touching the output path, so a failure (for example an
    # unsupported extension) does not leave an empty or truncated file.
    rendered = render(dataset, output_format,
                      n_jobs=args.n_jobs, verbose=args.verbose)

    # The Markdown output contains non-ASCII status glyphs; write UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(args.output_file, 'w', encoding='utf-8') as fp:
        fp.write(rendered)

    sys.exit(0 if os.path.exists(args.output_file) else 1)