diff --git a/cdf2cim/__init__.py b/cdf2cim/__init__.py
index 38658c5..f32c4a3 100644
--- a/cdf2cim/__init__.py
+++ b/cdf2cim/__init__.py
@@ -26,7 +26,7 @@
 __date__ = "2019-01-18"
 __license__ = "GPL/CeCILL-2.1"
 __title__ = "cdf2cim"
-__version__ = "1.0.2.0"
+__version__ = "1.1.0"
diff --git a/cdf2cim/constants.py b/cdf2cim/constants.py
index 3e2edd6..d0c58a0 100644
--- a/cdf2cim/constants.py
+++ b/cdf2cim/constants.py
@@ -61,6 +61,16 @@
     MIP_ERA_CMIP6
     }
 
+# Set of fields to exclude from hash derivation.
+NON_HASH_FIELDS = (
+    'contact',
+    'references',
+    'forcing',
+    'variant_info',
+    "filenames",
+    "dataset_versions",
+)
+
 # Supported project codes.
 CMIP5 = 'CMIP5'
 CMIP6 = 'CMIP6'
diff --git a/cdf2cim/encoder.py b/cdf2cim/encoder.py
new file mode 100644
index 0000000..7471609
--- /dev/null
+++ b/cdf2cim/encoder.py
@@ -0,0 +1,44 @@
+"""
+.. module:: encoder.py
+   :license: GPL/CeCIL
+   :platform: Unix, Windows
+   :synopsis: Encodes cdf2cim data prior to I/O operation.
+
+.. moduleauthor:: David Hassell
+
+
+"""
+import collections
+
+import numpy
+
+from cdf2cim import hashifier
+from cdf2cim.constants import NON_HASH_FIELDS
+
+
+
+def encode(obj: dict) -> collections.OrderedDict:
+    """Encodes output from map/reduce as a JSON safe dictionary.
+
+    :param dict obj: Output from a map/reduce job.
+
+    :returns: A JSON safe dictionary
+
+    """
+    def _encode(key, value):
+        """Encodes a value.
+
+        """
+        if isinstance(value, numpy.float64):
+            return float(value)
+        if isinstance(value, numpy.int32):
+            return int(value)
+        if key.endswith("_index"):
+            return int(value)
+        return value
+
+    result = collections.OrderedDict()
+    for k in sorted(obj.keys()):
+        result[k] = _encode(k, obj[k])
+
+    return result
diff --git a/cdf2cim/hashifier.py b/cdf2cim/hashifier.py
index 9d30153..c14483c 100644
--- a/cdf2cim/hashifier.py
+++ b/cdf2cim/hashifier.py
@@ -8,20 +8,21 @@
 
 """
+import collections
 import json
 import hashlib
 
+from cdf2cim.constants import NON_HASH_FIELDS
 
 
-def hashify(metadata):
-    """Returns hashes dervied from a cdf2cim metadata blob.
+def hashify(metadata: collections.OrderedDict) -> str:
+    """Returns hashes derived from a cdf2cim metadata blob.
 
     :param dict metadata: Simulation metadata.
 
     """
-    metadata_as_text = json.dumps(metadata)
-    hash_id = hashlib.md5(metadata_as_text.encode('utf-8')).hexdigest()
-    hash_id = f"{hash_id}{metadata['start_time']}{metadata['end_time']}"
-    hash_id = hashlib.md5(hash_id.encode('utf-8')).hexdigest()
+    target = metadata.copy()
+    for field in NON_HASH_FIELDS:
+        target.pop(field, None)
 
-    return hash_id
+    return hashlib.md5(json.dumps(target).encode('utf-8')).hexdigest()
diff --git a/cdf2cim/io_manager.py b/cdf2cim/io_manager.py
index 5728618..4907809 100644
--- a/cdf2cim/io_manager.py
+++ b/cdf2cim/io_manager.py
@@ -17,6 +17,7 @@
 import numpy
 
 from cdf2cim import exceptions
+from cdf2cim import encoder
 from cdf2cim import hashifier
 from cdf2cim import logger
 from cdf2cim.constants import FILE_STATUS_PUBLISHED
@@ -27,34 +28,6 @@
 
 
 
 
-def encode(obj):
-    """Encodes an output from a map/reduce as a JSON safe dictionary.
-
-    :param dict obj: Output from a map/reduce job.
-
-    :returns: A JSON safe dictionary
-    :rtype: dict
-
-    """
-    def _encode(key, value):
-        """Encodes a value.
-
-        """
-        if isinstance(value, numpy.float64):
-            return float(value)
-        if isinstance(value, numpy.int32):
-            return int(value)
-        if key.endswith("_index"):
-            return int(value)
-        return value
-
-    result = collections.OrderedDict()
-    for k in sorted(obj.keys()):
-        result[k] = _encode(k, obj[k])
-
-    return result
-
-
 def yield_files(criteria):
     """Yields files implied by the criteria.
@@ -133,7 +106,7 @@ def dump(obj, overwrite):
     """
     # Set metadata (a JSON serializable ordered dictionary).
-    metadata = encode(obj)
+    metadata = encoder.encode(obj)
 
     # Set hash id.
     metadata['_hash_id'] = hashifier.hashify(metadata)
diff --git a/cdf2cim/mapper.py b/cdf2cim/mapper.py
index a495a83..4d20cd8 100644
--- a/cdf2cim/mapper.py
+++ b/cdf2cim/mapper.py
@@ -62,23 +62,6 @@ def execute(identifier, properties, dates):
         if v:
             cim2_properties[prop] = ', '.join(sorted(v))
 
-    # Include all items from extra3 from all files, omitting
-    # duplicates, as a list
-    extra3 = {
-        'dataset_versions': [],
-        'filenames' : [],
-    }
-
-    for p in properties:
-        for x, v in extra3.items():
-            v.append(p.get(x))
-
-    for prop, v in extra3.items():
-        v = set(v)
-        v.discard(None)
-        if v:
-            cim2_properties[prop] = tuple(sorted(v))
-
     # ------------------------------------------------------------
     # The cim2_properties dictionary now contains everything
     # needed to create CIM2 Enemble, Ensemble Member and
diff --git a/cdf2cim/parser.py b/cdf2cim/parser.py
index 5ed65ad..45b1c22 100644
--- a/cdf2cim/parser.py
+++ b/cdf2cim/parser.py
@@ -93,9 +93,8 @@ def parse(cf_field):
     # Add the dataset version to the cim2 properties. It is assumed
     # that the file path of the file is
     # /a/load/of/DRS/stuff/<version>/filename.nc
-    cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]
-
-    cim2_properties['filenames'] = cf_field.fpath
+    # cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]
+    # cim2_properties['filenames'] = cf_field.fpath
 
     # Add the time coordinates' calendar to the cim2 properties
     try:
diff --git a/tests/sample-output/cmip5.json b/tests/sample-output/cmip5.json
index 34990e6..9f6d785 100644
--- a/tests/sample-output/cmip5.json
+++ b/tests/sample-output/cmip5.json
@@ -2,9 +2,6 @@
     "branch_time_in_parent": 52560.0,
     "calendar": "360_day",
     "contact": "chris.d.jones@metoffice.gov.uk, michael.sanderson@metoffice.gov.uk",
-    "dataset_versions": [
-        "cmip5"
-    ],
     "end_time": "2010-12-01 00:00:00",
     "experiment_id": "rcp85",
     "forcing": "GHG, SA, Oz, LU, Sl, Vl, BC, OC, (GHG = CO2, N2O, CH4, CFCs)",
@@ -22,4 +19,4 @@
     "source_id": "HadGEM2-ES",
     "start_time": "2005-12-01 00:00:00",
     "_hash_id": "0552ec5f015718532ae73613e053346a"
-}
\ No newline at end of file
+}
diff --git a/tests/sample-output/cmip6.json b/tests/sample-output/cmip6.json
index 8a97f7b..1f1911f 100644
--- a/tests/sample-output/cmip6.json
+++ b/tests/sample-output/cmip6.json
@@ -4,9 +4,6 @@
     "branch_time_in_parent": "1809-11-12 00:00:00",
     "calendar": "360_day",
     "contact": "Python Coder (python@a.b.com) ",
-    "dataset_versions": [
-        "v1"
-    ],
     "end_time": "2006-06-01 00:00:00",
     "experiment_id": "piControl",
     "forcing_index": 1,
@@ -26,4 +23,4 @@
     "sub_experiment_id": "none",
     "variant_info": "forcing: black carbon aerosol only",
     "_hash_id": "4306ffb5d9abf74f02ab65709c51184b"
-}
\ No newline at end of file
+}
diff --git a/tests/test-data/cmip6/v1/tas_0.nc b/tests/test-data/cmip6/v1/tas_0.nc
index 8b0d676..d6b62b6 100644
Binary files a/tests/test-data/cmip6/v1/tas_0.nc and b/tests/test-data/cmip6/v1/tas_0.nc differ
diff --git a/tests/test_find.py b/tests/test_find.py
index 5cd4103..61e9f06 100644
--- a/tests/test_find.py
+++ b/tests/test_find.py
@@ -64,6 +64,7 @@ def _assert_simulation(obj, expected_fields):
     """
     assert isinstance(obj, dict)
     assert obj['mip_era'] in constants.MIP_ERA
-    assert cdf2cim.io_manager.encode(obj)
+#    assert cdf2cim.io_manager.encode(obj)
+    assert cdf2cim.encoder.encode(obj)
     for key in [i for i in expected_fields if not i.startswith('_')]:
         assert key in obj, (key, expected_fields)
diff --git a/tests/test_hash.py b/tests/test_hash.py
new file mode 100644
index 0000000..bd778c2
--- /dev/null
+++ b/tests/test_hash.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+"""
+.. module:: test_hash.py
+
+   :license: GPL / CeCILL
+   :platform: Unix, Windows
+   :synopsis: Executes hashifier unit tests.
+
+.. moduleauthor:: Earth System Documentation (ES-DOC)
+
+"""
+import inspect
+import json
+import os
+import tempfile
+
+import cf
+
+import cdf2cim
+from utils import *
+
+
+def test_is_function():
+    """ES-DOC :: cdf2cim :: hashifier :: cdf2cim.hashify function is supported.
+
+    """
+    assert inspect.isfunction(cdf2cim.hashifier.hashify)
+
+
+def test_equal_hash_id():
+    """ES-DOC :: cdf2cim :: scan
+
+    """
+    filename = os.path.join(CMIP6_NETCDF_DIR, 'tas_0.nc')
+    tmpfile = tempfile.mkstemp('_test_hash.nc', dir=os.getcwd())[1]
+
+    # Create a temporary file that has different, but non-hashable
+    # properties
+    f = cf.read(filename, verbose=1)[0]
+    for attr in cdf2cim.constants.NON_HASH_FIELDS:
+        f.set_property(attr, 'DIFFERENT VALUE '+tmpfile)
+
+    cf.write(f, tmpfile)
+
+    blob = set()
+    for f in (filename, tmpfile):
+        for x in cdf2cim.scan(f):
+            try:
+                blob.add(x[0])
+            except IndexError:
+                pass
+            else:
+                break
+
+    os.remove(tmpfile)
+
+    # Test that both files produced the same blob, and therefore have
+    # the same hash
+    assert len(blob) == 1
diff --git a/tests/test_io_encode.py b/tests/test_io_encode.py
index bb7e912..e387e28 100644
--- a/tests/test_io_encode.py
+++ b/tests/test_io_encode.py
@@ -29,7 +29,8 @@ def test_encode():
     """
     for obj in cdf2cim.find(NETCDF_DIR):
-        assert isinstance(cdf2cim.io_manager.encode(obj), dict)
+#        assert isinstance(cdf2cim.io_manager.encode(obj), dict)
+        assert isinstance(cdf2cim.encoder.encode(obj), dict)
 
 
 def test_json_conversion_failure():
@@ -46,4 +47,5 @@ def test_convert_to_json():
     """
     for obj in cdf2cim.find(NETCDF_DIR):
-        assert json.dumps(cdf2cim.io_manager.encode(obj))
+#        assert json.dumps(cdf2cim.io_manager.encode(obj))
+        assert json.dumps(cdf2cim.encoder.encode(obj))