
Commit

Merge branch 'v1.X'
# Conflicts:
#	sdmxthon/utils/xml_base.py
#	setup.py
javihern98 committed Dec 22, 2022
2 parents d6c187b + 7690508 commit dc4465b
Showing 17 changed files with 222 additions and 1,002 deletions.
8 changes: 6 additions & 2 deletions .gitignore
@@ -6,5 +6,9 @@
 *.py[cod]
 *$py.class
 dist
-sdmxthon/docs/_build
-sdmxthon/outputTests/
+/sdmxthon/docs/_build
+/sdmxthon/outputTests/
+Pipfile.lock
+/SDMXthon.egg-info/
+/testApi.py
+/xml_parsers_test.py
33 changes: 32 additions & 1 deletion Changelog.rst
@@ -2,6 +2,38 @@
 Changelog
 #########
 
+1.3 (2022-05-31)
+----------------
+**Added**
+
+**Changes**
+- Implemented better understanding of inFile in read_xml.
+- Adapted to_vtl_json() to new format.
+
+**Bugfixes**
+
+1.2 (2021-01-12)
+----------------
+
+**Added**
+- Implemented several formats on validFrom/validTo, as shown on issue #17
+
+**Changes**
+- Redesigned reading process based on xmltodict
+- Implemented custom writing process based on generators. Reduced memory footprint and improved performance and maintainability. Implemented Generic Series writing process.
+- Improved overall performance on semantic validation.
+- Cleanup of old parsers and writing methods. Simplified code for better maintainability.
+- Model changes:
+  - Deleted 'dataset' on data retrieval
+  - Changed keys of message.content on Metadata Type.
+**Bugfixes**
+
+1.1 (2021-01-12)
+----------------
+
+Development version (Yanked Release), changes are implemented in 1.2.
+
+
 1.0.3 (2021-09-30)
 ------------------
@@ -13,7 +45,6 @@ Changelog
 
 - Fixed bug on Dataflow with constraints parsing.
 
-=======
 1.0.2 (2021-07-06)
 ------------------
 
10 changes: 5 additions & 5 deletions Pipfile
@@ -14,12 +14,12 @@ sphinx-autobuild = "*"
 sphinx-rtd-theme = "*"
 
 [packages]
-lxml = { version = "4.6.1", index = "pypi" }
-pandas = { version = "1.2.0", index = "pypi" }
+lxml = "==4.6.0"
+pandas = "==1.4.2"
 numpy = "==1.19.3"
-validators = "*"
-requests = "*"
-xmltodict = "*"
+validators = "==0.19.0"
+requests = "==2.27.1"
+xmltodict = "==0.13.0"
 
 [requires]
 python_version = "*"
800 changes: 0 additions & 800 deletions Pipfile.lock

This file was deleted.

28 changes: 23 additions & 5 deletions sdmxthon/api/api.py
@@ -4,7 +4,7 @@
 from sdmxthon.model.message import Message
 from sdmxthon.parsers.read import read_xml
 from sdmxthon.utils.enums import MessageTypeEnum
-from sdmxthon.utils.handlers import first_element_dict
+from sdmxthon.utils.handlers import first_element_dict, drop_na_all
 
 
 def read_sdmx(sdmx_file, validate=True) -> Message:
@@ -27,7 +27,8 @@ def read_sdmx(sdmx_file, validate=True) -> Message:
     return Message(type_, data)
 
 
-def get_datasets(data, path_to_metadata, validate=True):
+def get_datasets(data, path_to_metadata, validate=True,
+                 remove_empty_columns=True):
     """
     GetDatasets performs the operation of reading a SDMX Data and SDMX
     metadata files. URLs could be used.
@@ -38,6 +39,7 @@ def get_datasets(data, path_to_metadata, validate=True):
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :return: A :obj:`Dataset <model.dataSet.DataSet>` object or a dict of \
              :obj:`Datasets <model.dataSet.DataSet>`
@@ -55,28 +57,36 @@ def get_datasets(data, path_to_metadata, validate=True,
         elif v in metadata['Dataflows']:
             datasets[v].dataflow = metadata['Dataflows'][v]
 
+        if remove_empty_columns:
+            datasets[v].data = drop_na_all(datasets[v].data)
+
     if len(datasets) == 1:
         return first_element_dict(datasets)
 
     return datasets
 
 
-def get_pandas_df(data, validate=True):
+def get_pandas_df(data, validate=True, remove_empty_columns=True):
     """
     GetPandasDF reads all observations in a SDMX file as Pandas Dataframe(s)
     :param data: Path, URL or SDMX data file as string
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :return: A dict of `Pandas Dataframe \
              <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_
     """
     datasets = read_xml(data, "Data", validate=validate)
-    return {ds: datasets[ds].data for ds in datasets}
+    if not remove_empty_columns:
+        return {ds: datasets[ds].data for ds in datasets}
+    else:
+        return {ds: drop_na_all(datasets[ds].data) for ds in datasets}
 
 
-def xml_to_csv(data, output_path=None, validate=True, **kwargs):
+def xml_to_csv(data, output_path=None, validate=True,
+               remove_empty_columns=True, **kwargs):
     """
     XML to CSV transforms a SDMX file into a CSV. Saves the file on disk or
     .zip of CSV. If the SDMX data file has only a Dataset and output_path is
@@ -85,10 +95,18 @@ def xml_to_csv(data, output_path=None, validate=True, **kwargs):
     :param data: Path, URL or SDMX file as string (Data file)
     :param output_path: Path to save the CSV (default: None)
     :param validate: Validation of the XML file against the XSD (default: True)
+    :param remove_empty_columns: Removes empty columns on output pd.Dataframe
     :param kwargs: Kwargs for `to_csv \
              <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html>`_
     :return: A StringIO object if output_path is ''
     """
     datasets = read_xml(data, mode="Data", validate=validate)
+
+    if remove_empty_columns:
+        for ds in datasets:
+            datasets[ds].data = drop_na_all(datasets[ds].data)
+
     if output_path is not None and '.zip' in output_path:
         with ZipFile(output_path, 'w') as zipObj:
             # Add multiple files to the zip
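
Note: a quick usage sketch of the new remove_empty_columns flag introduced above (the file name is a placeholder; behaviour as described by the diff):

    from sdmxthon.api.api import get_pandas_df

    # 'data.xml' stands in for any SDMX data file, path or URL.
    # By default (remove_empty_columns=True) all-NaN columns are
    # dropped with drop_na_all before the DataFrames are returned.
    dataframes = get_pandas_df('data.xml')

    # Opting out keeps the raw columns, i.e. the behaviour that was
    # previously hard-coded in the Dataset.data setter.
    raw_dataframes = get_pandas_df('data.xml', remove_empty_columns=False)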
8 changes: 4 additions & 4 deletions sdmxthon/docs/conf.py
@@ -17,10 +17,10 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'SDMXThon'
-copyright = '2021, MeaningfulData'
+project = 'sdmxthon'
+copyright = '2022, MeaningfulData'
 author = 'MeaningfulData'
-version = '0.1'
+version = '1.3'
 
 # -- General configuration ---------------------------------------------------
 
@@ -56,7 +56,7 @@
     'analytics_id': '',
     'analytics_anonymize_ip': True,
     'logo_only': False,
-    'display_version': False,
+    'display_version': True,
     'prev_next_buttons_location': '',
     'style_external_links': True,
     'vcs_pageview_mode': 'blob',
18 changes: 9 additions & 9 deletions sdmxthon/docs/walkthrough.rst
@@ -62,7 +62,7 @@ The information in the payload can be obtained by using the payload class attrib
 
 .. code-block:: python
 
-    >>> sdmx_message.payload.dsds
+    >>> sdmx_message.payload['DataStructures']
 
     {'ECB:ECB_CBD2(1.0)': <DataStructureDefinition - ECB:ECB_CBD2(1.0)>}
@@ -75,7 +75,7 @@ the concrete objects:
     >>> sdmx_message.content
 
     {
-        "codelists": {
+        "Codelists": {
             "ECB:CL_ACTIVITY(1.0)": "<Codelist - ECB:CL_ACTIVITY(1.0)>",
             "ECB:CL_AREA(1.0)": "<Codelist - ECB:CL_AREA(1.0)>",
             "ECB:CL_CB_EXP_TYPE(1.0)": "<Codelist - ECB:CL_CB_EXP_TYPE(1.0)>",
@@ -98,13 +98,13 @@ the concrete objects:
             "ECB:CL_UNIT(1.0)": "<Codelist - ECB:CL_UNIT(1.0)>",
             "ECB:CL_UNIT_MULT(1.0)": "<Codelist - ECB:CL_UNIT_MULT(1.0)>"
         },
-        "concepts": {
+        "Concepts": {
             "ECB:ECB_CONCEPTS(1.0)": "<ConceptScheme - ECB:ECB_CONCEPTS(1.0)>"
         },
-        "dsds": {
+        "DataStructures": {
             "ECB:ECB_CBD2(1.0)": "<DataStructureDefinition - ECB:ECB_CBD2(1.0)>"
         },
-        "organisations": "<AgencyScheme - SDMX:AGENCIES(1.0)>"
+        "OrganisationSchemes": "<AgencyScheme - SDMX:AGENCIES(1.0)>"
     }
 
 The input to the read_sdmx method can be a file or an URL. An example with a URL:
@@ -133,13 +133,13 @@ Datasets objects:
     >>> sdmx_data_message = sdmxthon.read_sdmx('http://ec.europa.eu/eurostat/SDMX/diss-web/rest/data/nama_10_gdp/.CLV10_MEUR.B1GQ.BE/?startperiod=2005&endPeriod=2011')
     >>> sdmx_data_message.content
 
-    {'datasets': {'ESTAT_DSD_nama_10_gdp_1_0': <DataSet - No Structure found>}}
+    {'ESTAT_DSD_nama_10_gdp_1_0': <DataSet - No Structure found>}
 
 A Dataset object has a series of SDMX-specific attributes (see reference for complete list). The data in the dataset are stored as a Pandas Dataframe, in the *data* attribute:
 
 .. code-block:: python
 
-    >>> sdmx_data_message.content['datasets']['ESTAT_DSD_nama_10_gdp_1_0'].data
+    >>> sdmx_data_message.content['ESTAT_DSD_nama_10_gdp_1_0'].data
 
       NA_ITEM        UNIT GEO FREQ TIME_PERIOD  OBS_VALUE
     0    B1GQ  CLV10_MEUR  BE    A        2011   369293.6
@@ -234,7 +234,7 @@ Scheme we can access to the inner metadata classes.
 
 .. code-block:: python
 
-    >>> concept_scheme = message.content['concepts']["ECB:ECB_CONCEPTS(1.0)"]
+    >>> concept_scheme = message.content['Concepts']["ECB:ECB_CONCEPTS(1.0)"]
     >>> concept_scheme.items
 
     {
@@ -248,7 +248,7 @@ Regarding the DataStructureDefinition, we can access in a similar way:
 
 .. code-block:: python
 
-    >>> dsd = message.content['dsds']['ECB:ECB_CBD2(1.0)']
+    >>> dsd = message.content['DataStructures']['ECB:ECB_CBD2(1.0)']
     >>> dsd.content
 
     {'dimensions': {
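
In short, after this commit the walkthrough addresses message content through the renamed keys; a compact recap using the objects shown above:

    >>> message.content['Codelists']['ECB:CL_AREA(1.0)']
    <Codelist - ECB:CL_AREA(1.0)>
    >>> message.content['OrganisationSchemes']
    <AgencyScheme - SDMX:AGENCIES(1.0)>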
3 changes: 1 addition & 2 deletions sdmxthon/model/dataset.py
@@ -222,8 +222,7 @@ def data(self, value):
             if len(temp) > 0:
                 attached_attributes[e] = temp.loc[0, e]
                 del temp[e]
-
-        self._data = temp.dropna(axis=1, how="all")
+        self._data = temp
         if len(attached_attributes) > 0:
             for k, v in attached_attributes.items():
                 self.attached_attributes[k] = str(v)
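
With this change the Dataset.data setter stores the DataFrame as-is, and empty-column removal becomes opt-in at the API layer. The drop_na_all helper imported in sdmxthon/api/api.py is not shown in this view; a minimal sketch consistent with its call sites would be:

    import pandas as pd

    def drop_na_all(df: pd.DataFrame) -> pd.DataFrame:
        # Drop only columns whose every value is NaN, mirroring the
        # dropna(axis=1, how="all") call removed from the setter above.
        return df.dropna(axis=1, how="all")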
22 changes: 13 additions & 9 deletions sdmxthon/model/definitions.py
@@ -615,6 +615,10 @@ def to_vtl_json(self, path: str = None):
         """Formats the DataStructureDefinition as a VTL DataStructure"""
         dataset_name = self.id
         components = []
+        NAME = "name"
+        ROLE = "role"
+        TYPE = "type"
+        NULLABLE = "nullable"
         for c in self.dimension_descriptor.components.values():
 
             type_ = "String"
@@ -623,8 +627,8 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Identifier",
-                         "type": Data_Types_VTL[type_], "isNull": False}
+            component = {NAME: c.id, ROLE: "Identifier",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: False}
 
             components.append(component)
         if self.attribute_descriptor is not None:
@@ -635,8 +639,8 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Attribute",
-                         "type": Data_Types_VTL[type_], "isNull": True}
+            component = {NAME: c.id, ROLE: "Attribute",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: True}
 
             components.append(component)
         for c in self.measure_descriptor.components.values():
@@ -646,16 +650,16 @@
                     c.representation.type_ is not None):
                 type_ = c.representation.type_
 
-            component = {"name": c.id, "role": "Measure",
-                         "type": Data_Types_VTL[type_], "isNull": True}
+            component = {NAME: c.id, ROLE: "Measure",
+                         TYPE: Data_Types_VTL[type_], NULLABLE: True}
 
             components.append(component)
 
-        result = {
-            "DataSet": {"name": dataset_name, "DataStructure": components}}
+        result = {"datasets": [{"name": dataset_name,
+                                "DataStructure": components}]}
         if path is not None:
             with open(path, 'w') as fp:
-                fp.write(json.dumps(result))
+                json.dump(result, fp)
         else:
             return result
 
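
For reference, to_vtl_json() now returns a top-level 'datasets' list instead of a single 'DataSet' object, and components use 'nullable' rather than 'isNull'. An illustrative result (component names and types are examples, not taken from a real DSD):

    {
        "datasets": [
            {
                "name": "ECB_CBD2",
                "DataStructure": [
                    {"name": "FREQ", "role": "Identifier",
                     "type": "String", "nullable": False},
                    {"name": "OBS_VALUE", "role": "Measure",
                     "type": "Number", "nullable": True}
                ]
            }
        ]
    }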
5 changes: 4 additions & 1 deletion sdmxthon/parsers/data_read.py
@@ -55,7 +55,10 @@ def reading_generic_series(dataset) -> pd.DataFrame:
         for data in series[OBS]:
             obs = dict()
             obs[OBS_DIM] = data[OBS_DIM][VALUE.lower()]
-            obs[OBSVALUE.upper()] = data[OBSVALUE][VALUE.lower()]
+            if OBSVALUE in data:
+                obs[OBSVALUE.upper()] = data[OBSVALUE][VALUE.lower()]
+            else:
+                obs[OBSVALUE.upper()] = None
             if ATTRIBUTES in data:
                 obs = {**obs, **get_element_to_list(data, mode=ATTRIBUTES)}
             test_list.append({**keys, **obs})
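
This guard lets the generic Series reader tolerate observations with no ObsValue element instead of raising a KeyError. A self-contained illustration of the pattern (the literal xmltodict keys are simplified stand-ins for the module's constants):

    # Parsed observation that omits the ObsValue element entirely.
    data = {'generic:ObsDimension': {'value': '2011'}}

    obs_value = (data['generic:ObsValue']['value']
                 if 'generic:ObsValue' in data
                 else None)  # previously this lookup raised a KeyError
    assert obs_value is None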
23 changes: 15 additions & 8 deletions sdmxthon/parsers/read.py
@@ -32,12 +32,19 @@ def parse_sdmx(result):
         return create_metadata(result[STRUCTURE][STRUCTURES])
     else:
         message = result[global_mode]
-        if isinstance(message[DATASET], list):
+        dataset_key = None
+        for key in message:
+            if DATASET in key:
+                dataset_key = key
+        if dataset_key is None:
+            raise Exception('Cannot parse datasets on this file')
+
+        if isinstance(message[dataset_key], list):
             structures = {}
             # Relationship between structures and structure id
             for structure in message[HEADER][STRUCTURE]:
                 structures[structure[STRID]] = structure
-            for single_dataset in message[DATASET]:
+            for single_dataset in message[dataset_key]:
                 str_ref = single_dataset[STRREF]
                 if SERIES in single_dataset:
                     metadata = get_dataset_metadata(structures[str_ref],
@@ -51,23 +58,23 @@
                                                     global_mode)
                 datasets[metadata[STRID]] = ds
         else:
-            if SERIES in message[DATASET]:
+            if SERIES in message[dataset_key]:
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=SERIES)
-            elif OBS in message[DATASET]:
+            elif OBS in message[dataset_key]:
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=OBS)
             else:
                 if message[HEADER][STRUCTURE][DIM_OBS] == "AllDimensions":
                     mode = OBS
                 else:
                     mode = SERIES
                 metadata = get_dataset_metadata(message[HEADER][STRUCTURE],
-                                                message[DATASET][STRREF],
+                                                message[dataset_key][STRREF],
                                                 mode=mode)
-            ds = create_dataset(message[DATASET], metadata, global_mode)
+            ds = create_dataset(message[dataset_key], metadata, global_mode)
             datasets[metadata[STRID]] = ds
 
         return datasets
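
The new dataset_key scan decouples the parser from one hard-coded element name: xmltodict keeps XML namespace prefixes in its keys, so the dataset may arrive under a prefixed variant. A self-contained illustration of the matching logic (key names are simplified examples):

    DATASET = 'DataSet'
    message = {'Header': {}, 'message:DataSet': {'Series': []}}

    dataset_key = None
    for key in message:
        # A substring match also catches namespaced keys such as
        # 'message:DataSet', not just a bare 'DataSet'.
        if DATASET in key:
            dataset_key = key
    if dataset_key is None:
        raise Exception('Cannot parse datasets on this file')

    print(dataset_key)  # -> 'message:DataSet'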