Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cdf2cim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
__date__ = "2019-01-18"
__license__ = "GPL/CeCILL-2.1"
__title__ = "cdf2cim"
__version__ = "1.0.2.0"
__version__ = "1.1.0"



Expand Down
10 changes: 10 additions & 0 deletions cdf2cim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@
MIP_ERA_CMIP6
}

# Set of fields to exclude from hash derivation: descriptive metadata
# that may legitimately differ between otherwise identical simulations,
# so it must not influence the hash identifier.
NON_HASH_FIELDS = (
    'contact',
    'references',
    'forcing',
    'variant_info',
    'filenames',
    'dataset_versions',
)

# Supported project codes.
CMIP5 = 'CMIP5'
CMIP6 = 'CMIP6'
Expand Down
44 changes: 44 additions & 0 deletions cdf2cim/encoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
.. module:: encoder.py
   :license: GPL/CeCILL
:platform: Unix, Windows
:synopsis: Encodes cdf2cim data prior to I/O operation.

.. moduleauthor:: David Hassell <[email protected]>


"""
import collections

import numpy

from cdf2cim import hashifier
from cdf2cim.constants import NON_HASH_FIELDS



def encode(obj: dict) -> collections.OrderedDict:
    """Encodes output from map/reduce as a JSON safe dictionary.

    Numpy scalar values are converted to the equivalent builtin types
    so that the result can be serialized with ``json.dumps``.

    :param dict obj: Output from a map/reduce job.

    :returns: A JSON safe dictionary, ordered by key.

    """
    def _encode(key, value):
        """Encodes a value.

        """
        # Use the numpy abstract base classes so every scalar width is
        # covered (int32/int64, float32/float64), not only the
        # platform-dependent defaults.
        if isinstance(value, numpy.floating):
            return float(value)
        if isinstance(value, numpy.integer):
            return int(value)
        # Fields named *_index hold integral values by convention.
        if key.endswith("_index"):
            return int(value)
        return value

    result = collections.OrderedDict()
    for k in sorted(obj.keys()):
        result[k] = _encode(k, obj[k])

    return result
15 changes: 8 additions & 7 deletions cdf2cim/hashifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@


"""
import collections
import json
import hashlib

from cdf2cim.constants import NON_HASH_FIELDS


def hashify(metadata):
"""Returns hashes dervied from a cdf2cim metadata blob.
def hashify(metadata: collections.OrderedDict) -> str:
"""Returns hashes derived from a cdf2cim metadata blob.

:param dict metadata: Simulation metadata.

"""
metadata_as_text = json.dumps(metadata)
hash_id = hashlib.md5(metadata_as_text.encode('utf-8')).hexdigest()
hash_id = f"{hash_id}{metadata['start_time']}{metadata['end_time']}"
hash_id = hashlib.md5(hash_id.encode('utf-8')).hexdigest()
target = metadata.copy()
for field in NON_HASH_FIELDS:
target.pop(field, None)

return hash_id
return hashlib.md5(json.dumps(target).encode('utf-8')).hexdigest()
31 changes: 2 additions & 29 deletions cdf2cim/io_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import numpy

from cdf2cim import exceptions
from cdf2cim import encoder
from cdf2cim import hashifier
from cdf2cim import logger
from cdf2cim.constants import FILE_STATUS_PUBLISHED
Expand All @@ -27,34 +28,6 @@



def encode(obj):
    """Encodes an output from a map/reduce as a JSON safe dictionary.

    :param dict obj: Output from a map/reduce job.

    :returns: A JSON safe dictionary
    :rtype: dict

    """
    def _jsonify(key, value):
        """Maps a single value to a JSON safe equivalent.

        """
        # Numpy scalars and *_index fields become builtin float/int.
        if isinstance(value, numpy.float64):
            return float(value)
        if isinstance(value, numpy.int32) or key.endswith("_index"):
            return int(value)
        return value

    return collections.OrderedDict(
        (key, _jsonify(key, obj[key])) for key in sorted(obj)
        )


def yield_files(criteria):
"""Yields files implied by the criteria.

Expand Down Expand Up @@ -133,7 +106,7 @@ def dump(obj, overwrite):

"""
# Set metadata (a JSON serializable ordered dictionary).
metadata = encode(obj)
metadata = encoder.encode(obj)

# Set hash id.
metadata['_hash_id'] = hashifier.hashify(metadata)
Expand Down
17 changes: 0 additions & 17 deletions cdf2cim/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,6 @@ def execute(identifier, properties, dates):
if v:
cim2_properties[prop] = ', '.join(sorted(v))

# Include all items from extra3 from all files, omitting
# duplicates, as a list
extra3 = {
'dataset_versions': [],
'filenames' : [],
}

for p in properties:
for x, v in extra3.items():
v.append(p.get(x))

for prop, v in extra3.items():
v = set(v)
v.discard(None)
if v:
cim2_properties[prop] = tuple(sorted(v))

# ------------------------------------------------------------
# The cim2_properties dictionary now contains everything
# needed to create CIM2 Enemble, Ensemble Member and
Expand Down
5 changes: 2 additions & 3 deletions cdf2cim/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,8 @@ def parse(cf_field):
# Add the dataset version to the cim2 properties. It is assumed
# that the file path of the file is
# /a/load/of/DRS/stuff/<VERSION>/filename.nc
cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]

cim2_properties['filenames'] = cf_field.fpath
# cim2_properties['dataset_versions'] = cf_field.fpath.split('/')[-2]
# cim2_properties['filenames'] = cf_field.fpath

# Add the time coordinates' calendar to the cim2 properties
try:
Expand Down
5 changes: 1 addition & 4 deletions tests/sample-output/cmip5.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
"branch_time_in_parent": 52560.0,
"calendar": "360_day",
"contact": "[email protected], [email protected]",
"dataset_versions": [
"cmip5"
],
"end_time": "2010-12-01 00:00:00",
"experiment_id": "rcp85",
"forcing": "GHG, SA, Oz, LU, Sl, Vl, BC, OC, (GHG = CO2, N2O, CH4, CFCs)",
Expand All @@ -22,4 +19,4 @@
"source_id": "HadGEM2-ES",
"start_time": "2005-12-01 00:00:00",
"_hash_id": "0552ec5f015718532ae73613e053346a"
}
}
5 changes: 1 addition & 4 deletions tests/sample-output/cmip6.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
"branch_time_in_parent": "1809-11-12 00:00:00",
"calendar": "360_day",
"contact": "Python Coder ([email protected]) ",
"dataset_versions": [
"v1"
],
"end_time": "2006-06-01 00:00:00",
"experiment_id": "piControl",
"forcing_index": 1,
Expand All @@ -26,4 +23,4 @@
"sub_experiment_id": "none",
"variant_info": "forcing: black carbon aerosol only",
"_hash_id": "4306ffb5d9abf74f02ab65709c51184b"
}
}
Binary file modified tests/test-data/cmip6/v1/tas_0.nc
Binary file not shown.
3 changes: 2 additions & 1 deletion tests/test_find.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def _assert_simulation(obj, expected_fields):
"""
assert isinstance(obj, dict)
assert obj['mip_era'] in constants.MIP_ERA
assert cdf2cim.io_manager.encode(obj)
# assert cdf2cim.io_manager.encode(obj)
assert cdf2cim.encoder.encode(obj)
for key in [i for i in expected_fields if not i.startswith('_')]:
assert key in obj, (key, expected_fields)
60 changes: 60 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-

"""
.. module:: test_hash.py

:license: GPL / CeCILL
:platform: Unix, Windows
:synopsis: Executes hashifier unit tests.

.. moduleauthor:: Earth System Documentation (ES-DOC) <[email protected]>

"""
import inspect
import json
import os
import tempfile

import cf

import cdf2cim
from utils import *


def test_is_function():
    """ES-DOC :: cdf2cim :: hashifier :: cdf2cim.hashify function is supported.

    """
    target = getattr(cdf2cim.hashifier, 'hashify')
    assert inspect.isfunction(target)


def test_equal_hash_id():
    """ES-DOC :: cdf2cim :: scan

    """
    filename = os.path.join(CMIP6_NETCDF_DIR, 'tas_0.nc')
    tmpfile = tempfile.mkstemp('_test_hash.nc', dir=os.getcwd())[1]

    # Write a copy of the test file whose non-hashable properties all
    # differ from the original's.
    field = cf.read(filename, verbose=1)[0]
    for name in cdf2cim.constants.NON_HASH_FIELDS:
        field.set_property(name, 'DIFFERENT VALUE ' + tmpfile)

    cf.write(field, tmpfile)

    # Scan both files, collecting the first blob each one yields.
    blob = set()
    for path in (filename, tmpfile):
        for scanned in cdf2cim.scan(path):
            try:
                blob.add(scanned[0])
            except IndexError:
                pass
            else:
                break

    os.remove(tmpfile)

    # Test that both files produced the same blob, and therefore have
    # the same hash
    assert len(blob) == 1
6 changes: 4 additions & 2 deletions tests/test_io_encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def test_encode():

"""
for obj in cdf2cim.find(NETCDF_DIR):
assert isinstance(cdf2cim.io_manager.encode(obj), dict)
# assert isinstance(cdf2cim.io_manager.encode(obj), dict)
assert isinstance(cdf2cim.encoder.encode(obj), dict)


def test_json_conversion_failure():
Expand All @@ -46,4 +47,5 @@ def test_convert_to_json():

"""
for obj in cdf2cim.find(NETCDF_DIR):
assert json.dumps(cdf2cim.io_manager.encode(obj))
# assert json.dumps(cdf2cim.io_manager.encode(obj))
assert json.dumps(cdf2cim.encoder.encode(obj))