
Commit

Merge branch 'v1.X'
# Conflicts:
#	sdmxthon/utils/xml_base.py
#	setup.py
javihern98 committed Dec 22, 2022
2 parents d6c187b + 7690508 commit dc4465b
Showing 17 changed files with 222 additions and 1,002 deletions.
8 changes: 6 additions & 2 deletions .gitignore
@@ -6,5 +6,9 @@
 *.py[cod]
 *$py.class
 dist
-sdmxthon/docs/_build
-sdmxthon/outputTests/
+/sdmxthon/docs/_build
+/sdmxthon/outputTests/
+Pipfile.lock
+/SDMXthon.egg-info/
+/testApi.py
+/xml_parsers_test.py
33 changes: 32 additions & 1 deletion Changelog.rst
@@ -2,6 +2,38 @@
 Changelog
 #########
 
+1.3 (2022-05-31)
+----------------
+**Added**
+
+**Changes**
+- Implemented better understanding of inFile in read_xml.
+- Adapted to_vtl_json() to new format.
+
+**Bugfixes**
+
+1.2 (2021-01-12)
+----------------
+
+**Added**
+- Implemented several formats on validFrom/validTo, as shown on issue #17
+
+**Changes**
+- Redesigned reading process based on xmltodict
+- Implemented custom writing process based on generators. Reduced memory footprint and improved performance and maintainability. Implemented Generic Series writing process.
+- Improved overall performance on semantic validation.
+- Cleanup of old parsers and writing methods. Simplified code for better maintainability.
+- Model changes:
+  - Deleted 'dataset' on data retrieval
+  - Changed keys of message.content on Metadata Type.
+**Bugfixes**
+
+1.1 (2021-01-12)
+----------------
+
+Development version (Yanked Release), changes are implemented in 1.2.
+
+
 1.0.3 (2021-09-30)
 ------------------
@@ -13,7 +45,6 @@ Changelog
 
 - Fixed bug on Dataflow with constraints parsing.
 
-=======
 1.0.2 (2021-07-06)
 ------------------
 
10 changes: 5 additions & 5 deletions Pipfile
@@ -14,12 +14,12 @@ sphinx-autobuild = "*"
 sphinx-rtd-theme = "*"
 
 [packages]
-lxml = { version = "4.6.1", index = "pypi" }
-pandas = { version = "1.2.0", index = "pypi" }
+lxml = "==4.6.0"
+pandas = "==1.4.2"
 numpy = "==1.19.3"
-validators = "*"
-requests = "*"
-xmltodict = "*"
+validators = "==0.19.0"
+requests = "==2.27.1"
+xmltodict = "==0.13.0"
 
 [requires]
 python_version = "*"
800 changes: 0 additions & 800 deletions Pipfile.lock

This file was deleted.

28 changes: 23 additions & 5 deletions sdmxthon/api/api.py
@@ -4,7 +4,7 @@
 from sdmxthon.model.message import Message
 from sdmxthon.parsers.read import read_xml
 from sdmxthon.utils.enums import MessageTypeEnum
-from sdmxthon.utils.handlers import first_element_dict
+from sdmxthon.utils.handlers import first_element_dict, drop_na_all
 
 
 def read_sdmx(sdmx_file, validate=True) -> Message:
@@ -27,7 +27,8 @@ def read_sdmx(sdmx_file, validate=True) -> Message:
     return Message(type_, data)
 
 
-def get_datasets(data, path_to_metadata, validate=True):
+def get_datasets(data, path_to_metadata, validate=True,
+                 remove_empty_columns=True):
     """
     GetDatasets performs the operation of reading a SDMX Data and SDMX
     metadata files. URLs could be used.
@@ -38,6 +39,7 @@ def get_datasets(data, path_to_metadata, validate=True):
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :return: A :obj:`Dataset <model.dataSet.DataSet>` object or a dict of \
              :obj:`Datasets <model.dataSet.DataSet>`
@@ -55,28 +57,36 @@ def get_datasets(data, path_to_metadata, validate=True,
         elif v in metadata['Dataflows']:
             datasets[v].dataflow = metadata['Dataflows'][v]
 
+        if remove_empty_columns:
+            datasets[v].data = drop_na_all(datasets[v].data)
+
     if len(datasets) == 1:
         return first_element_dict(datasets)
 
     return datasets
 
 
-def get_pandas_df(data, validate=True):
+def get_pandas_df(data, validate=True, remove_empty_columns=True):
     """
     GetPandasDF reads all observations in a SDMX file as Pandas Dataframe(s)
     :param data: Path, URL or SDMX data file as string
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :return: A dict of `Pandas Dataframe \
              <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_
     """
     datasets = read_xml(data, "Data", validate=validate)
-    return {ds: datasets[ds].data for ds in datasets}
+    if not remove_empty_columns:
+        return {ds: datasets[ds].data for ds in datasets}
+    else:
+        return {ds: drop_na_all(datasets[ds].data) for ds in datasets}
 
 
-def xml_to_csv(data, output_path=None, validate=True, **kwargs):
+def xml_to_csv(data, output_path=None, validate=True,
+               remove_empty_columns=True, **kwargs):
     """
     XML to CSV transforms a SDMX file into a CSV. Saves the file on disk or
     .zip of CSV. If the SDMX data file has only a Dataset and output_path is
@@ -85,10 +95,18 @@ def xml_to_csv(data, output_path=None, validate=True, **kwargs):
     :param data: Path, URL or SDMX file as string (Data file)
     :param output_path: Path to save the CSV (default: None)
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :param kwargs: Kwargs for `to_csv \
              <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html>`_
     :return: A StringIO object if output_path is ''
     """
     datasets = read_xml(data, mode="Data", validate=validate)
+
+    if remove_empty_columns:
+        for ds in datasets:
+            datasets[ds].data = drop_na_all(datasets[ds].data)
+
     if output_path is not None and '.zip' in output_path:
         with ZipFile(output_path, 'w') as zipObj:
             # Add multiple files to the zip
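
Note: a quick usage sketch of the new remove_empty_columns flag introduced above (the file name is a placeholder; behaviour as described by the diff):

    from sdmxthon.api.api import get_pandas_df

    # 'data.xml' stands in for any SDMX data file, path or URL.
    # By default (remove_empty_columns=True) all-NaN columns are
    # dropped with drop_na_all before the DataFrames are returned.
    dataframes = get_pandas_df('data.xml')

    # Opting out keeps the raw columns, i.e. the behaviour that was
    # previously hard-coded in the Dataset.data setter.
    raw_dataframes = get_pandas_df('data.xml', remove_empty_columns=False)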
8 changes: 4 additions & 4 deletions sdmxthon/docs/conf.py
@@ -17,10 +17,10 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'SDMXThon'
-copyright = '2021, MeaningfulData'
+project = 'sdmxthon'
+copyright = '2022, MeaningfulData'
 author = 'MeaningfulData'
-version = '0.1'
+version = '1.3'
 
 # -- General configuration ---------------------------------------------------
 
@@ -56,7 +56,7 @@
     'analytics_id': '',
     'analytics_anonymize_ip': True,
     'logo_only': False,
-    'display_version': False,
+    'display_version': True,
     'prev_next_buttons_location': '',
     'style_external_links': True,
     'vcs_pageview_mode': 'blob',
18 changes: 9 additions & 9 deletions sdmxthon/docs/walkthrough.rst
@@ -62,7 +62,7 @@ The information in the payload can be obtained by using the payload class attrib
 
 .. code-block:: python
 
-    >>> sdmx_message.payload.dsds
+    >>> sdmx_message.payload['DataStructures']
 
     {'ECB:ECB_CBD2(1.0)': <DataStructureDefinition - ECB:ECB_CBD2(1.0)>}
@@ -75,7 +75,7 @@ the concrete objects:
     >>> sdmx_message.content
 
     {
-        "codelists": {
+        "Codelists": {
             "ECB:CL_ACTIVITY(1.0)": "<Codelist - ECB:CL_ACTIVITY(1.0)>",
             "ECB:CL_AREA(1.0)": "<Codelist - ECB:CL_AREA(1.0)>",
             "ECB:CL_CB_EXP_TYPE(1.0)": "<Codelist - ECB:CL_CB_EXP_TYPE(1.0)>",
@@ -98,13 +98,13 @@ the concrete objects:
             "ECB:CL_UNIT(1.0)": "<Codelist - ECB:CL_UNIT(1.0)>",
             "ECB:CL_UNIT_MULT(1.0)": "<Codelist - ECB:CL_UNIT_MULT(1.0)>"
         },
-        "concepts": {
+        "Concepts": {
             "ECB:ECB_CONCEPTS(1.0)": "<ConceptScheme - ECB:ECB_CONCEPTS(1.0)>"
         },
-        "dsds": {
+        "DataStructures": {
             "ECB:ECB_CBD2(1.0)": "<DataStructureDefinition - ECB:ECB_CBD2(1.0)>"
         },
-        "organisations": "<AgencyScheme - SDMX:AGENCIES(1.0)>"
+        "OrganisationSchemes": "<AgencyScheme - SDMX:AGENCIES(1.0)>"
     }
 
 The input to the read_sdmx method can be a file or an URL. An example with a URL:
@@ -133,13 +133,13 @@ Datasets objects:
     >>> sdmx_data_message = sdmxthon.read_sdmx('http://ec.europa.eu/eurostat/SDMX/diss-web/rest/data/nama_10_gdp/.CLV10_MEUR.B1GQ.BE/?startperiod=2005&endPeriod=2011')
     >>> sdmx_data_message.content
 
-    {'datasets': {'ESTAT_DSD_nama_10_gdp_1_0': <DataSet - No Structure found>}}
+    {'ESTAT_DSD_nama_10_gdp_1_0': <DataSet - No Structure found>}
 
 A Dataset object has a series of SDMX-specific attributes (see reference for complete list). The data in the dataset are stored as a Pandas Dataframe, in the *data* attribute:
 
 .. code-block:: python
 
-    >>> sdmx_data_message.content['datasets']['ESTAT_DSD_nama_10_gdp_1_0'].data
+    >>> sdmx_data_message.content['ESTAT_DSD_nama_10_gdp_1_0'].data
 
       NA_ITEM        UNIT GEO FREQ TIME_PERIOD  OBS_VALUE
     0    B1GQ  CLV10_MEUR  BE    A        2011   369293.6
@@ -234,7 +234,7 @@ Scheme we can access to the inner metadata classes.
 
 .. code-block:: python
 
-    >>> concept_scheme = message.content['concepts']["ECB:ECB_CONCEPTS(1.0)"]
+    >>> concept_scheme = message.content['Concepts']["ECB:ECB_CONCEPTS(1.0)"]
     >>> concept_scheme.items
 
     {
@@ -248,7 +248,7 @@ Regarding the DataStructureDefinition, we can access in a similar way:
 
 .. code-block:: python
 
-    >>> dsd = message.content['dsds']['ECB:ECB_CBD2(1.0)']
+    >>> dsd = message.content['DataStructures']['ECB:ECB_CBD2(1.0)']
     >>> dsd.content
 
     {'dimensions': {
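
In short, after this commit the walkthrough addresses message content through the renamed keys; a compact recap using the objects shown above:

    >>> message.content['Codelists']['ECB:CL_AREA(1.0)']
    <Codelist - ECB:CL_AREA(1.0)>
    >>> message.content['OrganisationSchemes']
    <AgencyScheme - SDMX:AGENCIES(1.0)>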
3 changes: 1 addition & 2 deletions sdmxthon/model/dataset.py
@@ -222,8 +222,7 @@ def data(self, value):
             if len(temp) > 0:
                 attached_attributes[e] = temp.loc[0, e]
                 del temp[e]
-
-        self._data = temp.dropna(axis=1, how="all")
+        self._data = temp
         if len(attached_attributes) > 0:
             for k, v in attached_attributes.items():
                 self.attached_attributes[k] = str(v)
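
With this change the Dataset.data setter stores the DataFrame as-is, and empty-column removal becomes opt-in at the API layer. The drop_na_all helper imported in sdmxthon/api/api.py is not shown in this view; a minimal sketch consistent with its call sites would be:

    import pandas as pd

    def drop_na_all(df: pd.DataFrame) -> pd.DataFrame:
        # Drop only columns whose every value is NaN, mirroring the
        # dropna(axis=1, how="all") call removed from the setter above.
        return df.dropna(axis=1, how="all")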
22 changes: 13 additions & 9 deletions sdmxthon/model/definitions.py
@@ -615,6 +615,10 @@ def to_vtl_json(self, path: str = None):
         """Formats the DataStructureDefinition as a VTL DataStructure"""
         dataset_name = self.id
         components = []
+        NAME = "name"
+        ROLE = "role"
+        TYPE = "type"
+        NULLABLE = "nullable"
         for c in self.dimension_descriptor.components.values():
 
             type_ = "String"
@@ -623,8 +627,8 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Identifier",
-                         "type": Data_Types_VTL[type_], "isNull": False}
+            component = {NAME: c.id, ROLE: "Identifier",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: False}
 
             components.append(component)
         if self.attribute_descriptor is not None:
@@ -635,8 +639,8 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Attribute",
-                         "type": Data_Types_VTL[type_], "isNull": True}
+            component = {NAME: c.id, ROLE: "Attribute",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: True}
 
             components.append(component)
         for c in self.measure_descriptor.components.values():
@@ -646,16 +650,16 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Measure",
-                         "type": Data_Types_VTL[type_], "isNull": True}
+            component = {NAME: c.id, ROLE: "Measure",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: True}
 
             components.append(component)
 
-        result = {
-            "DataSet": {"name": dataset_name, "DataStructure": components}}
+        result = {"datasets": [{"name": dataset_name,
+                                "DataStructure": components}]}
         if path is not None:
             with open(path, 'w') as fp:
-                fp.write(json.dumps(result))
+                json.dump(result, fp)
         else:
             return result
 
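
For reference, to_vtl_json() now returns a top-level 'datasets' list instead of a single 'DataSet' object, and components use 'nullable' rather than 'isNull'. An illustrative result (component names and types are examples, not taken from a real DSD):

    {
        "datasets": [
            {
                "name": "ECB_CBD2",
                "DataStructure": [
                    {"name": "FREQ", "role": "Identifier",
                     "type": "String", "nullable": False},
                    {"name": "OBS_VALUE", "role": "Measure",
                     "type": "Number", "nullable": True}
                ]
            }
        ]
    }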
5 changes: 4 additions & 1 deletion sdmxthon/parsers/data_read.py
@@ -55,7 +55,10 @@ def reading_generic_series(dataset) -> pd.DataFrame:
         for data in series[OBS]:
             obs = dict()
             obs[OBS_DIM] = data[OBS_DIM][VALUE.lower()]
-            obs[OBSVALUE.upper()] = data[OBSVALUE][VALUE.lower()]
+            if OBSVALUE in data:
+                obs[OBSVALUE.upper()] = data[OBSVALUE][VALUE.lower()]
+            else:
+                obs[OBSVALUE.upper()] = None
             if ATTRIBUTES in data:
                 obs = {**obs, **get_element_to_list(data, mode=ATTRIBUTES)}
             test_list.append({**keys, **obs})
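
This guard lets the generic Series reader tolerate observations with no ObsValue element instead of raising a KeyError. A self-contained illustration of the pattern (the literal xmltodict keys are simplified stand-ins for the module's constants):

    # Parsed observation that omits the ObsValue element entirely.
    data = {'generic:ObsDimension': {'value': '2011'}}

    obs_value = (data['generic:ObsValue']['value']
                 if 'generic:ObsValue' in data
                 else None)  # previously this lookup raised a KeyError
    assert obs_value is None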
23 changes: 15 additions & 8 deletions sdmxthon/parsers/read.py
@@ -32,12 +32,19 @@ def parse_sdmx(result):
         return create_metadata(result[STRUCTURE][STRUCTURES])
     else:
         message = result[global_mode]
-        if isinstance(message[DATASET], list):
+        dataset_key = None
+        for key in message:
+            if DATASET in key:
+                dataset_key = key
+        if dataset_key is None:
+            raise Exception('Cannot parse datasets on this file')
+
+        if isinstance(message[dataset_key], list):
             structures = {}
             # Relationship between structures and structure id
             for structure in message[HEADER][STRUCTURE]:
                 structures[structure[STRID]] = structure
-            for single_dataset in message[DATASET]:
+            for single_dataset in message[dataset_key]:
                 str_ref = single_dataset[STRREF]
                 if SERIES in single_dataset:
                     metadata = get_dataset_metadata(structures[str_ref],
@@ -51,23 +58,23 @@
                                                     global_mode)
                 datasets[metadata[STRID]] = ds
         else:
-            if SERIES in message[DATASET]:
+            if SERIES in message[dataset_key]:
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=SERIES)
-            elif OBS in message[DATASET]:
+            elif OBS in message[dataset_key]:
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=OBS)
             else:
                 if message[HEADER][STRUCTURE][DIM_OBS] == "AllDimensions":
                     mode = OBS
                 else:
                     mode = SERIES
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=mode)
-            ds = create_dataset(message[DATASET], metadata, global_mode)
+            ds = create_dataset(message[dataset_key], metadata, global_mode)
             datasets[metadata[STRID]] = ds
 
         return datasets
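
The new dataset_key scan decouples the parser from one hard-coded element name: xmltodict keeps XML namespace prefixes in its keys, so the dataset may arrive under a prefixed variant. A self-contained illustration of the matching logic (key names are simplified examples):

    DATASET = 'DataSet'
    message = {'Header': {}, 'message:DataSet': {'Series': []}}

    dataset_key = None
    for key in message:
        # A substring match also catches namespaced keys such as
        # 'message:DataSet', not just a bare 'DataSet'.
        if DATASET in key:
            dataset_key = key
    if dataset_key is None:
        raise Exception('Cannot parse datasets on this file')

    print(dataset_key)  # -> 'message:DataSet'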