Converting MFP CSV to YAML schedule (#111)

iuryt · pre-commit-ci[bot] · VeckoTheGecko · web-flow · commit 0357ac47da71 · 2025-02-14T15:53:06.000+01:00
* draft script for converting MFP CSV to YAML schedule * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add openpyxl * add mfp_to_yaml function * add new command to init to accept mfp file as input * delete files from scripts/ * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * export the schedule body instead of saving file * change name of cli param and adapt for new mfp_to_yaml function * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add warning message for time entry on yaml * change to pydantic and change name of variables * add XBT * accept nonetype time * change to Waypoint to BaseModel and add field_serializer for instrument and time * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove restriction for version * add checking for columns from excel file * add unit tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add update comments and var naming * Remove buffering from mfp conversion * update references to Waypoint --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
diff --git a/environment.yml b/environment.yml
@@ -12,6 +12,7 @@ dependencies:
   - pip
   - pyyaml
   - copernicusmarine >= 2
+  - openpyxl
 
   # linting
   - pre-commit
diff --git a/src/virtualship/cli/commands.py b/src/virtualship/cli/commands.py
@@ -16,16 +16,26 @@
     hash_to_filename,
 )
 from virtualship.expedition.do_expedition import _get_schedule, do_expedition
-from virtualship.utils import SCHEDULE, SHIP_CONFIG
+from virtualship.utils import SCHEDULE, SHIP_CONFIG, mfp_to_yaml
 
 
 @click.command()
 @click.argument(
     "path",
     type=click.Path(exists=False, file_okay=False, dir_okay=True),
 )
-def init(path):
-    """Initialize a directory for a new expedition, with an example schedule and ship config files."""
+@click.option(
+    "--from-mfp",
+    type=str,
+    default=None,
+    help='Partially initialise a project from an exported xlsx or csv file from NIOZ\' Marine Facilities Planning tool (specifically the "Export Coordinates > DD" option). User edits are required after initialisation.',
+)
+def init(path, from_mfp):
+    """
+    Initialize a directory for a new expedition, with an example schedule and ship config files.
+
+    If --from-mfp is provided, it will generate the schedule from the MFP file instead.
+    """
     path = Path(path)
     path.mkdir(exist_ok=True)
 
@@ -43,7 +53,20 @@ def init(path):
         )
 
     config.write_text(utils.get_example_config())
-    schedule.write_text(utils.get_example_schedule())
+    if from_mfp:
+        mfp_file = Path(from_mfp)
+        # Generate schedule.yaml from the MFP file
+        click.echo(f"Generating schedule from {mfp_file}...")
+        mfp_to_yaml(mfp_file, schedule)
+        click.echo(
+            "\n⚠️  The generated schedule does not contain time values. "
+            "\nPlease edit 'schedule.yaml' and manually add the necessary time values."
+            "\n🕒  Expected time format: 'YYYY-MM-DD HH:MM:SS' (e.g., '2023-10-20 01:00:00').\n"
+        )
+    else:
+        # Create a default example schedule
+        # schedule_body = utils.get_example_schedule()
+        schedule.write_text(utils.get_example_schedule())
 
     click.echo(f"Created '{config.name}' and '{schedule.name}' at {path}.")
 
diff --git a/src/virtualship/expedition/space_time_region.py b/src/virtualship/expedition/space_time_region.py
@@ -42,13 +42,17 @@ def _check_lon_lat_domain(self) -> Self:
 class TimeRange(BaseModel):
     """Defines the temporal boundaries for a space-time region."""
 
-    start_time: datetime
-    end_time: datetime
+    #! TODO: Remove the `| None` for `start_time` and `end_time`, and have the MFP functionality not use pydantic (with testing to avoid codebase drift)
+    start_time: datetime | None = None
+    end_time: datetime | None = None
 
     @model_validator(mode="after")
     def _check_time_range(self) -> Self:
-        if not self.start_time < self.end_time:
-            raise ValueError("start_time must be before end_time")
+        if (
+            self.start_time and self.end_time
+        ):  #! TODO: remove this check once `start_time` and `end_time` are required
+            if not self.start_time < self.end_time:
+                raise ValueError("start_time must be before end_time")
         return self
 
 
diff --git a/src/virtualship/expedition/waypoint.py b/src/virtualship/expedition/waypoint.py
@@ -1,16 +1,23 @@
 """Waypoint class."""
 
-from dataclasses import dataclass
 from datetime import datetime
 
+from pydantic import BaseModel, field_serializer
+
 from ..location import Location
 from .instrument_type import InstrumentType
 
 
-@dataclass
-class Waypoint:
+class Waypoint(BaseModel):
     """A Waypoint to sail to with an optional time and an optional instrument."""
 
     location: Location
     time: datetime | None = None
     instrument: InstrumentType | list[InstrumentType] | None = None
+
+    @field_serializer("instrument")
+    def serialize_instrument(self, instrument):
+        """Ensure InstrumentType is serialized as a string (or list of strings)."""
+        if isinstance(instrument, list):
+            return [inst.value for inst in instrument]
+        return instrument.value if instrument else None
diff --git a/src/virtualship/utils.py b/src/virtualship/utils.py
@@ -2,6 +2,7 @@
 from importlib.resources import files
 from typing import TextIO
 
+import pandas as pd
 import yaml
 from pydantic import BaseModel
 
@@ -37,3 +38,116 @@ def _dump_yaml(model: BaseModel, stream: TextIO) -> str | None:
 def _generic_load_yaml(data: str, model: BaseModel) -> BaseModel:
     """Load a yaml string into a pydantic model."""
     return model.model_validate(yaml.safe_load(data))
+
+
+def mfp_to_yaml(excel_file_path: str, yaml_output_path: str):  # noqa: D417
+    """
+    Generates a YAML file with spatial and temporal information based on instrument data from MFP excel file.
+
+    Parameters
+    ----------
+    - excel_file_path (str): Path to the Excel file containing coordinate and instrument data.
+
+    The function:
+    1. Reads instrument and location data from the Excel file.
+    2. Determines the maximum depth based on the instruments present.
+    3. Builds the spatial range from the minimum and maximum longitude and latitude of the stations.
+    4. Writes the resulting schedule to the YAML file at yaml_output_path.
+
+    """
+    # Importing Schedule and related models from expedition module
+    from virtualship.expedition.instrument_type import InstrumentType
+    from virtualship.expedition.schedule import Schedule
+    from virtualship.expedition.space_time_region import (
+        SpaceTimeRegion,
+        SpatialRange,
+        TimeRange,
+    )
+    from virtualship.expedition.waypoint import Location, Waypoint
+
+    # Expected column headers
+    expected_columns = {"Station Type", "Name", "Latitude", "Longitude", "Instrument"}
+
+    # Read data from Excel
+    coordinates_data = pd.read_excel(excel_file_path)
+
+    # Check if the headers match the expected ones
+    actual_columns = set(coordinates_data.columns)
+
+    missing_columns = expected_columns - actual_columns
+    if missing_columns:
+        raise ValueError(
+            f"Error: Found columns {list(actual_columns)}, but expected columns {list(expected_columns)}. "
+            "Are you sure that you're using the correct export from MFP?"
+        )
+
+    extra_columns = actual_columns - expected_columns
+    if extra_columns:
+        print(
+            f"Warning: Found additional unexpected columns {list(extra_columns)}. "
+            "Manually added columns have no effect. "
+            "If the MFP export format changed, please submit an issue: "
+            "https://github.com/OceanParcels/virtualship/issues."
+        )
+
+    # Drop unexpected columns (optional, only if you want to ensure strict conformity)
+    coordinates_data = coordinates_data[list(expected_columns)]
+
+    # Continue with the rest of the function after validation...
+    coordinates_data = coordinates_data.dropna()
+
+    # maximum depth (in meters) for each instrument
+    instrument_max_depths = {
+        "XBT": 2000,
+        "CTD": 5000,
+        "DRIFTER": 1,
+        "ARGO_FLOAT": 2000,
+    }
+
+    unique_instruments = set()
+
+    for instrument_list in coordinates_data["Instrument"]:
+        instruments = instrument_list.split(", ")
+        unique_instruments |= set(instruments)
+
+    # Determine the maximum depth based on the unique instruments
+    maximum_depth = max(
+        instrument_max_depths.get(instrument, 0) for instrument in unique_instruments
+    )
+
+    spatial_range = SpatialRange(
+        minimum_longitude=coordinates_data["Longitude"].min(),
+        maximum_longitude=coordinates_data["Longitude"].max(),
+        minimum_latitude=coordinates_data["Latitude"].min(),
+        maximum_latitude=coordinates_data["Latitude"].max(),
+        minimum_depth=0,
+        maximum_depth=maximum_depth,
+    )
+
+    # Create space-time region object
+    space_time_region = SpaceTimeRegion(
+        spatial_range=spatial_range,
+        time_range=TimeRange(),
+    )
+
+    # Generate waypoints
+    waypoints = []
+    for _, row in coordinates_data.iterrows():
+        instruments = [
+            InstrumentType(instrument) for instrument in row["Instrument"].split(", ")
+        ]
+        waypoints.append(
+            Waypoint(
+                instrument=instruments,
+                location=Location(latitude=row["Latitude"], longitude=row["Longitude"]),
+            )
+        )
+
+    # Create Schedule object
+    schedule = Schedule(
+        waypoints=waypoints,
+        space_time_region=space_time_region,
+    )
+
+    # Save to YAML file
+    schedule.to_yaml(yaml_output_path)
diff --git a/tests/expedition/test_schedule.py b/tests/expedition/test_schedule.py
@@ -12,9 +12,11 @@ def test_schedule(tmpdir) -> None:
 
     schedule = Schedule(
         waypoints=[
-            Waypoint(Location(0, 0), time=base_time, instrument=None),
+            Waypoint(location=Location(0, 0), time=base_time, instrument=None),
             Waypoint(
-                Location(1, 1), time=base_time + timedelta(hours=1), instrument=None
+                location=Location(1, 1),
+                time=base_time + timedelta(hours=1),
+                instrument=None,
             ),
         ]
     )
diff --git a/tests/expedition/test_simulate_schedule.py b/tests/expedition/test_simulate_schedule.py
@@ -20,8 +20,8 @@ def test_simulate_schedule_feasible() -> None:
     ship_config.ship_speed_meter_per_second = 5.14
     schedule = Schedule(
         waypoints=[
-            Waypoint(Location(0, 0), base_time),
-            Waypoint(Location(0.01, 0), base_time + timedelta(days=1)),
+            Waypoint(location=Location(0, 0), time=base_time),
+            Waypoint(location=Location(0.01, 0), time=base_time + timedelta(days=1)),
         ]
     )
 
@@ -38,8 +38,8 @@ def test_simulate_schedule_too_far() -> None:
     ship_config = ShipConfig.from_yaml("expedition_dir/ship_config.yaml")
     schedule = Schedule(
         waypoints=[
-            Waypoint(Location(0, 0), base_time),
-            Waypoint(Location(1.0, 0), base_time + timedelta(minutes=1)),
+            Waypoint(location=Location(0, 0), time=base_time),
+            Waypoint(location=Location(1.0, 0), time=base_time + timedelta(minutes=1)),
         ]
     )
 
diff --git a/tests/test_mfp_to_yaml.py b/tests/test_mfp_to_yaml.py
@@ -0,0 +1,102 @@
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+from virtualship.expedition.instrument_type import InstrumentType
+from virtualship.expedition.schedule import Schedule
+from virtualship.utils import mfp_to_yaml
+
+# Sample correct MFP data
+VALID_MFP_DATA = pd.DataFrame(
+    {
+        "Station Type": ["A", "B", "C"],
+        "Name": ["Station1", "Station2", "Station3"],
+        "Latitude": [30, 31, 32],
+        "Longitude": [-44, -45, -46],
+        "Instrument": ["CTD, DRIFTER", "ARGO_FLOAT", "XBT, CTD, DRIFTER"],
+    }
+)
+
+# Missing required columns
+MISSING_HEADERS_DATA = pd.DataFrame(
+    {"Station Type": ["A"], "Name": ["Station1"], "Latitude": [10.5]}
+)
+
+# Extra unexpected columns
+EXTRA_HEADERS_DATA = VALID_MFP_DATA.copy()
+EXTRA_HEADERS_DATA["Unexpected Column"] = ["Extra1", "Extra2", "Extra3"]
+
+
+@patch("pandas.read_excel", return_value=VALID_MFP_DATA)
+def test_mfp_to_yaml_success(mock_read_excel, tmp_path):
+    """Test that mfp_to_yaml correctly processes a valid MFP Excel file."""
+    yaml_output_path = tmp_path / "schedule.yaml"
+
+    # Run function (No need to mock open() for YAML, real file is created)
+    mfp_to_yaml("mock_file.xlsx", yaml_output_path)
+
+    # Ensure the YAML file was written
+    assert yaml_output_path.exists()
+
+    # Load YAML and validate contents
+    data = Schedule.from_yaml(yaml_output_path)
+
+    assert len(data.waypoints) == 3
+    assert data.waypoints[0].instrument == [InstrumentType.CTD, InstrumentType.DRIFTER]
+    assert data.waypoints[1].instrument == [InstrumentType.ARGO_FLOAT]
+    assert data.waypoints[2].instrument == [
+        InstrumentType.XBT,
+        InstrumentType.CTD,
+        InstrumentType.DRIFTER,
+    ]
+
+
+@patch("pandas.read_excel", return_value=MISSING_HEADERS_DATA)
+def test_mfp_to_yaml_missing_headers(mock_read_excel, tmp_path):
+    """Test that mfp_to_yaml raises an error when required columns are missing."""
+    yaml_output_path = tmp_path / "schedule.yaml"
+
+    with pytest.raises(
+        ValueError, match="Error: Found columns .* but expected columns .*"
+    ):
+        mfp_to_yaml("mock_file.xlsx", yaml_output_path)
+
+
+@patch("pandas.read_excel", return_value=EXTRA_HEADERS_DATA)
+@patch("builtins.print")  # Capture printed warnings
+def test_mfp_to_yaml_extra_headers(mock_print, mock_read_excel, tmp_path):
+    """Test that mfp_to_yaml prints a warning when extra columns are found."""
+    yaml_output_path = tmp_path / "schedule.yaml"
+
+    # Run function
+    mfp_to_yaml("mock_file.xlsx", yaml_output_path)
+
+    # Ensure a warning message was printed
+    mock_print.assert_any_call(
+        "Warning: Found additional unexpected columns ['Unexpected Column']. "
+        "Manually added columns have no effect. "
+        "If the MFP export format changed, please submit an issue: "
+        "https://github.com/OceanParcels/virtualship/issues."
+    )
+
+
+@patch("pandas.read_excel", return_value=VALID_MFP_DATA)
+def test_mfp_to_yaml_instrument_conversion(mock_read_excel, tmp_path):
+    """Test that instruments are correctly converted into InstrumentType enums."""
+    yaml_output_path = tmp_path / "schedule.yaml"
+
+    # Run function
+    mfp_to_yaml("mock_file.xlsx", yaml_output_path)
+
+    # Load the generated YAML
+    data = Schedule.from_yaml(yaml_output_path)
+
+    assert isinstance(data.waypoints[0].instrument, list)
+    assert data.waypoints[0].instrument == [InstrumentType.CTD, InstrumentType.DRIFTER]
+    assert data.waypoints[1].instrument == [InstrumentType.ARGO_FLOAT]
+    assert data.waypoints[2].instrument == [
+        InstrumentType.XBT,
+        InstrumentType.CTD,
+        InstrumentType.DRIFTER,
+    ]

Original file line number	Diff line number	Diff line change
`@@ -12,9 +12,11 @@ def test_schedule(tmpdir) -> None:`
`12`	`12`
`13`	`13`	`schedule = Schedule(`
`14`	`14`	`waypoints=[`
`15`		`- Waypoint(Location(0, 0), time=base_time, instrument=None),`
	`15`	`+ Waypoint(location=Location(0, 0), time=base_time, instrument=None),`
`16`	`16`	`Waypoint(`
`17`		`- Location(1, 1), time=base_time + timedelta(hours=1), instrument=None`
	`17`	`+ location=Location(1, 1),`
	`18`	`+ time=base_time + timedelta(hours=1),`
	`19`	`+ instrument=None,`
`18`	`20`	`),`
`19`	`21`	`]`
`20`	`22`	`)`
Original file line number	Diff line number	Diff line change
`@@ -20,8 +20,8 @@ def test_simulate_schedule_feasible() -> None:`
`20`	`20`	`ship_config.ship_speed_meter_per_second = 5.14`
`21`	`21`	`schedule = Schedule(`
`22`	`22`	`waypoints=[`
`23`		`- Waypoint(Location(0, 0), base_time),`
`24`		`- Waypoint(Location(0.01, 0), base_time + timedelta(days=1)),`
	`23`	`+ Waypoint(location=Location(0, 0), time=base_time),`
	`24`	`+ Waypoint(location=Location(0.01, 0), time=base_time + timedelta(days=1)),`
`25`	`25`	`]`
`26`	`26`	`)`
`27`	`27`
`@@ -38,8 +38,8 @@ def test_simulate_schedule_too_far() -> None:`
`38`	`38`	`ship_config = ShipConfig.from_yaml("expedition_dir/ship_config.yaml")`
`39`	`39`	`schedule = Schedule(`
`40`	`40`	`waypoints=[`
`41`		`- Waypoint(Location(0, 0), base_time),`
`42`		`- Waypoint(Location(1.0, 0), base_time + timedelta(minutes=1)),`
	`41`	`+ Waypoint(location=Location(0, 0), time=base_time),`
	`42`	`+ Waypoint(location=Location(1.0, 0), time=base_time + timedelta(minutes=1)),`
`43`	`43`	`]`
`44`	`44`	`)`
`45`	`45`