-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathidentify_freq_schema.py
64 lines (54 loc) · 2.21 KB
/
identify_freq_schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import xml.etree.ElementTree as ET
import os
# Where to load from and save to.
SCHEMA_DIR = 'OECD_schema'
DATA_DIR = 'OECD_keys'
KEY_NAMES_FILE = os.path.join(DATA_DIR, 'OECD_key_names.csv')
DATA_FILE = os.path.join(DATA_DIR, 'FREQ_key_names.csv')

# Run statistics reported at the end of the script.
dataset_files_cnt = 0
has_datasettype_node_cnt = 0

# Data to be collected: parallel lists, one entry per matching dimension.
usable_datasets = []
frequency_keywords = []

# Load the list of data set ids.
# The ids are later concatenated into file names, so read the column as text:
# letting pandas infer the dtype would turn numeric-looking ids into ints
# (dropping leading zeros) and blank cells into float NaN, both of which
# break string concatenation.
dataset_ids_df = pd.read_csv(KEY_NAMES_FILE, dtype={'KeyFamilyId': str})
dataset_ids = dataset_ids_df['KeyFamilyId'].dropna().tolist()
# Go through each data set schema file and check whether it declares a
# FREQUENCY or FREQ dimension for observations.
_FREQUENCY_NAMES = ('FREQUENCY', 'FREQ')


def _dataset_type_nodes(root):
    """Return the direct children of *root* named 'DataSetType'.

    A child qualifies when its ``name`` XML attribute equals 'DataSetType'.
    """
    return [node for node in root if node.attrib.get('name') == 'DataSetType']


def _frequency_dimensions(dataset_type_node):
    """Return the FREQUENCY/FREQ names declared under *dataset_type_node*.

    The candidates are the children of the node's first grandchild
    (``node[0][0]``); each child whose ``name`` attribute is 'FREQUENCY' or
    'FREQ' contributes that name, in document order.  A node too shallow to
    have that grandchild yields an empty list instead of raising IndexError.
    """
    try:
        # NOTE(review): presumably the complexContent/extension wrapper of the
        # schema's DataSetType definition -- confirm against a sample file.
        container = dataset_type_node[0][0]
    except IndexError:
        return []
    return [
        child.attrib['name']
        for child in container
        if child.attrib.get('name') in _FREQUENCY_NAMES
    ]


for dataset_id in dataset_ids:
    # str() keeps the path construction working even if an id was loaded as
    # a non-string value (e.g. a number).
    schema_path = os.path.join(SCHEMA_DIR, str(dataset_id) + '.xml')
    try:
        tree = ET.parse(schema_path)
    except FileNotFoundError:
        # Best effort: ids without a schema file on disk are skipped.
        continue
    except ET.ParseError as err:
        # One malformed schema file should not abort the whole scan.
        print(dataset_id, 'schema could not be parsed:', err)
        continue

    dataset_files_cnt += 1
    for dst_node in _dataset_type_nodes(tree.getroot()):
        # print(dataset_id, 'has DataSetType')
        has_datasettype_node_cnt += 1
        for dimension in _frequency_dimensions(dst_node):
            usable_datasets.append(dataset_id)
            frequency_keywords.append(dimension)
            print(dataset_id, 'pandasdmx usable with', dimension)
# Persist the matches (only when there are any), indexed by data set id.
if usable_datasets:
    usableDF = pd.DataFrame(
        {
            'KeyFamilyId': usable_datasets,
            'Dimension': frequency_keywords,
        }
    ).set_index('KeyFamilyId')
    usableDF.to_csv(DATA_FILE)

# Final summary of the run.
usable_cnt = len(usable_datasets)
print()
print('completed ...')
print(
    'Out of', dataset_files_cnt, 'data set files,',
    usable_cnt, 'are usable by pandasdmx.',
)
print(has_datasettype_node_cnt, 'have DataSetType nodes')