Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cdf2cim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
__date__ = "2019-01-18"
__license__ = "GPL/CeCILL-2.1"
__title__ = "cdf2cim"
__version__ = "1.0.2.0"
__version__ = "1.1.0"



Expand Down
10 changes: 10 additions & 0 deletions cdf2cim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@
MIP_ERA_CMIP6
}

# Set of fields to exclude from hash derivation: descriptive metadata
# that may legitimately differ between otherwise identical simulations,
# so it must not influence the hash identifier.
NON_HASH_FIELDS = (
    'contact',
    'references',
    'forcing',
    'variant_info',
    'filenames',
    'dataset_versions',
)

# Supported project codes.
CMIP5 = 'CMIP5'
CMIP6 = 'CMIP6'
Expand Down
44 changes: 44 additions & 0 deletions cdf2cim/encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
.. module:: encoder.py
   :license: GPL/CeCILL
:platform: Unix, Windows
:synopsis: Encodes cdf2cim data prior to I/O operation.

.. moduleauthor:: David Hassell <[email protected]>


"""
import collections

import numpy

from cdf2cim import hashifier
from cdf2cim.constants import NON_HASH_FIELDS



def encode(obj: dict) -> collections.OrderedDict:
    """Encodes output from map/reduce as a JSON safe dictionary.

    Numpy scalar values are converted to the equivalent builtin types
    so that the result can be serialized with ``json.dumps``.

    :param dict obj: Output from a map/reduce job.

    :returns: A JSON safe dictionary, ordered by key.

    """
    def _encode(key, value):
        """Encodes a value.

        """
        # Use the numpy abstract base classes so every scalar width is
        # covered (int32/int64, float32/float64), not only the
        # platform-dependent defaults.
        if isinstance(value, numpy.floating):
            return float(value)
        if isinstance(value, numpy.integer):
            return int(value)
        # Fields named *_index hold integral values by convention.
        if key.endswith("_index"):
            return int(value)
        return value

    result = collections.OrderedDict()
    for k in sorted(obj.keys()):
        result[k] = _encode(k, obj[k])

    return result
15 changes: 8 additions & 7 deletions cdf2cim/hashifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@


"""
import collections
import json
import hashlib

from cdf2cim.constants import NON_HASH_FIELDS


def hashify(metadata):
"""Returns hashes dervied from a cdf2cim metadata blob.
def hashify(metadata: collections.OrderedDict) -> str:
"""Returns hashes derived from a cdf2cim metadata blob.

:param dict metadata: Simulation metadata.

"""
metadata_as_text = json.dumps(metadata)
hash_id = hashlib.md5(metadata_as_text.encode('utf-8')).hexdigest()
hash_id = f"{hash_id}{metadata['start_time']}{metadata['end_time']}"
hash_id = hashlib.md5(hash_id.encode('utf-8')).hexdigest()
target = metadata.copy()
for field in NON_HASH_FIELDS:
target.pop(field, None)

return hash_id
return hashlib.md5(json.dumps(target).encode('utf-8')).hexdigest()
31 changes: 2 additions & 29 deletions cdf2cim/io_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import numpy

from cdf2cim import exceptions
from cdf2cim import encoder
from cdf2cim import hashifier
from cdf2cim import logger
from cdf2cim.constants import FILE_STATUS_PUBLISHED
Expand All @@ -27,34 +28,6 @@



def encode(obj):
    """Encodes an output from a map/reduce as a JSON safe dictionary.

    :param dict obj: Output from a map/reduce job.

    :returns: A JSON safe dictionary
    :rtype: dict

    """
    def _jsonify(key, value):
        """Maps a single value to a JSON safe equivalent.

        """
        # Numpy scalars and *_index fields become builtin float/int.
        if isinstance(value, numpy.float64):
            return float(value)
        if isinstance(value, numpy.int32) or key.endswith("_index"):
            return int(value)
        return value

    return collections.OrderedDict(
        (key, _jsonify(key, obj[key])) for key in sorted(obj)
        )


def yield_files(criteria):
"""Yields files implied by the criteria.

Expand Down Expand Up @@ -133,7 +106,7 @@ def dump(obj, overwrite):

"""
# Set metadata (a JSON serializable ordered dictionary).
metadata = encode(obj)
metadata = encoder.encode(obj)

# Set hash id.
metadata['_hash_id'] = hashifier.hashify(metadata)
Expand Down
17 changes: 0 additions & 17 deletions cdf2cim/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,6 @@ def execute(identifier, properties, dates):
if v:
cim2_properties[prop] = ', '.join(sorted(v))

# Include all items from extra3 from all files, omitting
# duplicates, as a list
extra3 = {
'dataset_versions': [],
'filenames' : [],
}

for p in properties:
for x, v in extra3.items():
v.append(p.get(x))

for prop, v in extra3.items():
v = set(v)
v.discard(None)
if v:
cim2_properties[prop] = tuple(sorted(v))

# ------------------------------------------------------------
# The cim2_properties dictionary now contains everything
# needed to create CIM2 Enemble, Ensemble Member and
Expand Down
5 changes: 2 additions & 3 deletions cdf2cim/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,8 @@ def parse(cf_field):
# Add the dataset version to the cim2 properties. It is assumed
# that the file path of the file is
# /a/load/of/DRS/stuff/<VERSION>/filename.nc
cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]

cim2_properties['filenames'] = cf_field.fpath
# cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]
# cim2_properties['filenames'] = cf_field.fpath

# Add the time coordinates' calendar to the cim2 properties
try:
Expand Down
5 changes: 1 addition & 4 deletions tests/sample-output/cmip5.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
"branch_time_in_parent": 52560.0,
"calendar": "360_day",
"contact": "[email protected], [email protected]",
"dataset_versions": [
"cmip5"
],
"end_time": "2010-12-01 00:00:00",
"experiment_id": "rcp85",
"forcing": "GHG, SA, Oz, LU, Sl, Vl, BC, OC, (GHG = CO2, N2O, CH4, CFCs)",
Expand All @@ -22,4 +19,4 @@
"source_id": "HadGEM2-ES",
"start_time": "2005-12-01 00:00:00",
"_hash_id": "0552ec5f015718532ae73613e053346a"
}
}
5 changes: 1 addition & 4 deletions tests/sample-output/cmip6.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
"branch_time_in_parent": "1809-11-12 00:00:00",
"calendar": "360_day",
"contact": "Python Coder ([email protected]) ",
"dataset_versions": [
"v1"
],
"end_time": "2006-06-01 00:00:00",
"experiment_id": "piControl",
"forcing_index": 1,
Expand All @@ -26,4 +23,4 @@
"sub_experiment_id": "none",
"variant_info": "forcing: black carbon aerosol only",
"_hash_id": "4306ffb5d9abf74f02ab65709c51184b"
}
}
Binary file modified tests/test-data/cmip6/v1/tas_0.nc
Binary file not shown.
3 changes: 2 additions & 1 deletion tests/test_find.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def _assert_simulation(obj, expected_fields):
"""
assert isinstance(obj, dict)
assert obj['mip_era'] in constants.MIP_ERA
assert cdf2cim.io_manager.encode(obj)
# assert cdf2cim.io_manager.encode(obj)
assert cdf2cim.encoder.encode(obj)
for key in [i for i in expected_fields if not i.startswith('_')]:
assert key in obj, (key, expected_fields)
60 changes: 60 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-

"""
.. module:: test_hash.py

:license: GPL / CeCILL
:platform: Unix, Windows
:synopsis: Executes hashifier unit tests.

.. moduleauthor:: Earth System Documentation (ES-DOC) <[email protected]>

"""
import inspect
import json
import os
import tempfile

import cf

import cdf2cim
from utils import *


def test_is_function():
    """ES-DOC :: cdf2cim :: hashifier :: cdf2cim.hashify function is supported.

    """
    target = getattr(cdf2cim.hashifier, 'hashify')
    assert inspect.isfunction(target)


def test_equal_hash_id():
    """ES-DOC :: cdf2cim :: scan

    """
    filename = os.path.join(CMIP6_NETCDF_DIR, 'tas_0.nc')
    tmpfile = tempfile.mkstemp('_test_hash.nc', dir=os.getcwd())[1]

    # Write a copy of the test file whose non-hashable properties all
    # differ from the original's.
    field = cf.read(filename, verbose=1)[0]
    for name in cdf2cim.constants.NON_HASH_FIELDS:
        field.set_property(name, 'DIFFERENT VALUE ' + tmpfile)

    cf.write(field, tmpfile)

    # Scan both files, collecting the first blob each one yields.
    blob = set()
    for path in (filename, tmpfile):
        for scanned in cdf2cim.scan(path):
            try:
                blob.add(scanned[0])
            except IndexError:
                pass
            else:
                break

    os.remove(tmpfile)

    # Test that both files produced the same blob, and therefore have
    # the same hash
    assert len(blob) == 1
6 changes: 4 additions & 2 deletions tests/test_io_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def test_encode():

"""
for obj in cdf2cim.find(NETCDF_DIR):
assert isinstance(cdf2cim.io_manager.encode(obj), dict)
# assert isinstance(cdf2cim.io_manager.encode(obj), dict)
assert isinstance(cdf2cim.encoder.encode(obj), dict)


def test_json_conversion_failure():
Expand All @@ -46,4 +47,5 @@ def test_convert_to_json():

"""
for obj in cdf2cim.find(NETCDF_DIR):
assert json.dumps(cdf2cim.io_manager.encode(obj))
# assert json.dumps(cdf2cim.io_manager.encode(obj))
assert json.dumps(cdf2cim.encoder.encode(obj))