Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: Avoid loss of precision when casting in packet loading (alternative) #786

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions imap_processing/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,3 @@ def test_packet_file_to_datasets_flat_definition():
)
with pytest.raises(ValueError, match="Packet fields do not match"):
utils.packet_file_to_datasets(packet_files, packet_definition)


def test__create_minimum_dtype_array():
    """Test expected return types for minimum data types."""
    # Values that fit the requested dtype come back with exactly that dtype.
    coercible = utils._create_minimum_dtype_array([1, 2, 3], "uint8")
    assert coercible.dtype == np.dtype("uint8")
    # Strings can't be coerced to uint8, so numpy's inferred dtype is used
    # instead (1-character little-endian unicode).
    fallback = utils._create_minimum_dtype_array(["a", "b", "c"], "uint8")
    assert fallback.dtype == np.dtype("<U1")
54 changes: 21 additions & 33 deletions imap_processing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import Optional, Union

import numpy as np
import numpy.typing as npt
import pandas as pd
import xarray as xr
from space_packet_parser import parser, xtcedef
Expand Down Expand Up @@ -228,8 +227,8 @@ def create_dataset(


def _get_minimum_numpy_datatype( # noqa: PLR0912 - Too many branches
name: str, definition: xtcedef.XtcePacketDefinition
) -> str:
name: str, definition: xtcedef.XtcePacketDefinition, use_derived_value: bool = True
) -> Optional[str]:
"""
Get the minimum datatype for a given variable.

Expand All @@ -239,6 +238,8 @@ def _get_minimum_numpy_datatype( # noqa: PLR0912 - Too many branches
The variable name.
definition : xtcedef.XtcePacketDefinition
The XTCE packet definition.
use_derived_value : bool, default True
Whether or not the derived value from the XTCE definition was used.

Returns
-------
Expand All @@ -247,7 +248,21 @@ def _get_minimum_numpy_datatype( # noqa: PLR0912 - Too many branches
"""
data_encoding = definition.named_parameters[name].parameter_type.encoding

if isinstance(data_encoding, xtcedef.NumericDataEncoding):
if use_derived_value and isinstance(
definition.named_parameters[name].parameter_type,
xtcedef.EnumeratedParameterType,
):
# We don't have a way of knowing what is enumerated,
# let numpy infer the datatype
return None
elif isinstance(data_encoding, xtcedef.NumericDataEncoding):
if use_derived_value and (
data_encoding.context_calibrators is not None
or data_encoding.default_calibrator is not None
):
# If there are calibrators, we need to default to None and
# let numpy infer the datatype
return None
nbits = data_encoding.size_in_bits
if isinstance(data_encoding, xtcedef.IntegerDataEncoding):
datatype = "int"
Expand Down Expand Up @@ -280,31 +295,6 @@ def _get_minimum_numpy_datatype( # noqa: PLR0912 - Too many branches
return datatype


def _create_minimum_dtype_array(values: list, dtype: str) -> npt.NDArray:
"""
Create an array with the minimum datatype.

If it can't be coerced to that datatype, fallback to general array creation
without a specific datatype. This can happen with derived values.

Parameters
----------
values : list
List of values.
dtype : str
The datatype.

Returns
-------
array : np.array
The array of values.
"""
try:
return np.array(values, dtype=dtype)
except ValueError:
return np.array(values)


def packet_file_to_datasets(
packet_file: Union[str, Path],
xtce_packet_definition: Union[str, Path],
Expand Down Expand Up @@ -384,7 +374,7 @@ def packet_file_to_datasets(
if key not in datatype_mapping[apid]:
# Add this datatype to the mapping
datatype_mapping[apid][key] = _get_minimum_numpy_datatype(
key, packet_definition
key, packet_definition, use_derived_value=use_derived_value
)

dataset_by_apid = {}
Expand All @@ -398,9 +388,7 @@ def packet_file_to_datasets(
{
key.lower(): (
"epoch",
_create_minimum_dtype_array(
list_of_values, dtype=datatype_mapping[apid][key]
),
np.asarray(list_of_values, dtype=datatype_mapping[apid][key]),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does numpy default to 64-bit floats? And if so, do we have any way to get the CDF products to use 32-bit floats where we want them?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes and yes.

I am actually getting 32-bit floats in another packet definition where the floats are brought down directly in the telemetry stream, so space_packet_parser emits the Python floats and then we can use the bits to determine 32/64 bits correctly:

elif isinstance(data_encoding, xtcedef.FloatDataEncoding):
datatype = "float"
if nbits == 32:
datatype += "32"
else:
datatype += "64"

This situation arises purely when a conversion factor gets applied and we just shrug and say: there isn't much I can know about it from the packet definition — you told me it was an unsigned 8-bit value in the packet, but then you multiplied it by 2.5, and all bets are off as to what you want me to turn that into (float, new int, string enumeration, ...).

Ultimately, this gets to your earlier suggestions though that this is really just a "first cut" and we should be putting dtype into the CDF metadata attributes yaml definitions and doing the casting there when creating the datasets and saving the products.

)
for key, list_of_values in data.items()
},
Expand Down
Loading