From 4c8d505aabbe93d0e64eb847d6c7cdda5c341136 Mon Sep 17 00:00:00 2001
From: Greg Lucas <greg.m.lucas@gmail.com>
Date: Mon, 26 Aug 2024 06:53:33 -0600
Subject: [PATCH] FIX: Avoid loss of precision when casting in packet loading

When using derived values, a linear conversion factor may be applied
to a raw value, for instance to turn a uint8 measurement into a float
temperature. These values are represented as a small uint datatype
onboard, but need to be represented as a float or larger integer
datatype on the ground so we don't lose precision. Previously, the
derived values were cast back to their onboard types, so a value
such as 2.1 was truncated to 2.
---
 imap_processing/tests/test_utils.py | 28 ++++++++++++++++++++++------
 imap_processing/utils.py            | 12 ++++++++++--
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/imap_processing/tests/test_utils.py b/imap_processing/tests/test_utils.py
index 787289a6d..6f42dfa4b 100644
--- a/imap_processing/tests/test_utils.py
+++ b/imap_processing/tests/test_utils.py
@@ -97,10 +97,26 @@ def test_packet_file_to_datasets(use_derived_value, expected_mode):
     np.testing.assert_array_equal(data["mode"], [expected_mode] * len(data["mode"]))
 
 
-def test__create_minimum_dtype_array():
+@pytest.mark.parametrize(
+    ("arr", "dtype", "expected_dtype"),
+    # Expected basic case
+    [
+        ([1, 2, 3], "uint8", "uint8"),
+        # We shouldn't go lower than requested either
+        ([1, 2, 3], "uint16", "uint16"),
+        # Can't cast negative, fallback to default
+        ([-1, 2, 3], "uint8", "int64"),
+        # Small signed ints should be good
+        ([-1, 2, 3], "int8", "int8"),
+        # Can't cast strings to ints, fallback to default
+        (["a", "b", "c"], "uint8", "<U1"),
+        # Can't cast floats to ints, fallback to default
+        ([1, 2.5, 3], "uint8", "float64"),
+        # Can't cast larger ints, fallback to default
+        ([1, 1000, 2000], "uint8", "int64"),
+    ],
+)
+def test__create_minimum_dtype_array(arr, dtype, expected_dtype):
     """Test expected return types for minimum data types."""
-    result = utils._create_minimum_dtype_array([1, 2, 3], "uint8")
-    assert result.dtype == np.dtype("uint8")
-    # fallback to a generic array if the requested dtype can't be satisfied
-    result = utils._create_minimum_dtype_array(["a", "b", "c"], "uint8")
-    assert result.dtype == np.dtype("<U1")
+    result = utils._create_minimum_dtype_array(arr, dtype)
+    assert result.dtype == np.dtype(expected_dtype)
diff --git a/imap_processing/utils.py b/imap_processing/utils.py
index 680cc3f2d..a74b22120 100644
--- a/imap_processing/utils.py
+++ b/imap_processing/utils.py
@@ -299,10 +299,18 @@ def _create_minimum_dtype_array(values: list, dtype: str) -> npt.NDArray:
     array : np.array
         The array of values.
     """
+    # Create an initial array and then try to safely cast it to the desired dtype
+    x = np.asarray(values)
     try:
-        return np.array(values, dtype=dtype)
+        # ValueError: when trying to cast strings (enum states) to ints
+        y = x.astype(dtype, copy=False)
+        # We need to compare the arrays to see if we trimmed any values by
+        # casting to a smaller datatype (e.g. float64 to uint8, 2.1 to 2)
+        if np.array_equal(x, y):
+            return y
     except ValueError:
-        return np.array(values)
+        pass
+    return x
 
 
 def packet_file_to_datasets(