Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5c5bf9a
draft script for converting MFP CSV to YAML schedule
iuryt Jan 28, 2025
40c7ff9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 28, 2025
366dede
add openpyxl
iuryt Feb 3, 2025
d046444
add mfp_to_yaml function
iuryt Feb 3, 2025
e3199fa
add new command to init to accept mfp file as input
iuryt Feb 3, 2025
d9fe46a
delete files from scripts/
iuryt Feb 3, 2025
7dc9bd7
deleted scripts files
iuryt Feb 3, 2025
a79433c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 3, 2025
11332f8
export the schedule body instead of saving file
iuryt Feb 13, 2025
ad54992
change name of cli param and adapt for new mfp_to_yaml function
iuryt Feb 13, 2025
66adb18
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 13, 2025
2672afa
add warning message for time entry on yaml
iuryt Feb 13, 2025
a370641
change to pydantic and change name of variables
iuryt Feb 13, 2025
b87d944
add XBT
iuryt Feb 13, 2025
eba08b8
accept nonetype time
iuryt Feb 13, 2025
c0a52ac
change to Waypoint to BaseModel and add field_serializer for instrume…
iuryt Feb 13, 2025
526d2af
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 13, 2025
c51043d
remove restriction for version
iuryt Feb 14, 2025
f3daaa7
add checking for columns from excel file
iuryt Feb 14, 2025
4c59420
add unit tests
iuryt Feb 14, 2025
b67b15d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 14, 2025
6f63cd4
Add update comments and var naming
VeckoTheGecko Feb 14, 2025
222df85
Remove buffering from mfp conversion
VeckoTheGecko Feb 14, 2025
c94567b
update references to Waypoint
VeckoTheGecko Feb 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- pip
- pyyaml
- copernicusmarine >= 2
- openpyxl

# linting
- pre-commit
Expand Down
31 changes: 27 additions & 4 deletions src/virtualship/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,26 @@
hash_to_filename,
)
from virtualship.expedition.do_expedition import _get_schedule, do_expedition
from virtualship.utils import SCHEDULE, SHIP_CONFIG
from virtualship.utils import SCHEDULE, SHIP_CONFIG, mfp_to_yaml


@click.command()
@click.argument(
"path",
type=click.Path(exists=False, file_okay=False, dir_okay=True),
)
def init(path):
"""Initialize a directory for a new expedition, with an example schedule and ship config files."""
@click.option(
"--mfp-file",
type=str,
default=None,
help='Partially initialise a project from an exported xlsx or csv file from NIOZ\' Marine Facilities Planning tool (specifically the "Export Coordinates > DD" option). User edits are required after initialisation.',
)
def init(path, mfp_file):
"""
Initialize a directory for a new expedition, with an example schedule and ship config files.

If --mfp-file is provided, it will generate the schedule from the MPF file instead.
"""
path = Path(path)
path.mkdir(exist_ok=True)

Expand All @@ -43,7 +53,20 @@ def init(path):
)

config.write_text(utils.get_example_config())
schedule.write_text(utils.get_example_schedule())

if mfp_file:
# Generate schedule.yaml from the MPF file
click.echo(f"Generating schedule from {mfp_file}...")
schedule_body = mfp_to_yaml(mfp_file, schedule)
click.echo(
"\n⚠️ The generated schedule does not contain time values. "
"\nPlease edit 'schedule.yaml' and manually add the necessary time values."
"\n🕒 Expected time format: 'YYYY-MM-DD HH:MM:SS' (e.g., '2023-10-20 01:00:00').\n"
)
else:
# Create a default example schedule
# schedule_body = utils.get_example_schedule()
schedule.write_text(utils.get_example_schedule())

click.echo(f"Created '{config.name}' and '{schedule.name}' at {path}.")

Expand Down
1 change: 1 addition & 0 deletions src/virtualship/expedition/instrument_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ class InstrumentType(Enum):
CTD = "CTD"
DRIFTER = "DRIFTER"
ARGO_FLOAT = "ARGO_FLOAT"
XBT = "XBT"
9 changes: 5 additions & 4 deletions src/virtualship/expedition/space_time_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@ def _check_lon_lat_domain(self) -> Self:
class TimeRange(BaseModel):
    """Defines the temporal boundaries for a space-time region.

    Both bounds are optional so that partially initialised schedules
    (e.g. generated from an MFP export, where the user fills in times
    afterwards) can still be constructed and serialized.
    """

    # Optional so a schedule can be created before times are known.
    start_time: datetime | None = None
    end_time: datetime | None = None

    @model_validator(mode="after")
    def _check_time_range(self) -> Self:
        # Only validate ordering when both bounds are present; a partially
        # specified range is deliberately allowed (see class docstring).
        if self.start_time is not None and self.end_time is not None:
            if not self.start_time < self.end_time:
                raise ValueError("start_time must be before end_time")
        return self
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@VeckoTheGecko @ammedd
While using pydantic, I had to change this to accept a `None` time, but this means that when we do `schedule.from_yaml()` for fetch it will not raise any error, right? What should we do?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I thought that pydantic would have a way of disabling validation during the initialisation of the object, but looking further at the documentation, it seems that isn't possible. Long-term it would be good to have start_time and end_time not be None, but perhaps that's something for a future PR.



Expand Down
18 changes: 15 additions & 3 deletions src/virtualship/expedition/waypoint.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
"""Waypoint class."""

from dataclasses import dataclass
from datetime import datetime

from pydantic import BaseModel, field_serializer

from ..location import Location
from .instrument_type import InstrumentType


@dataclass
class Waypoint:
class Waypoint(BaseModel):
    """A Waypoint to sail to with an optional time and an optional instrument."""

    location: Location
    time: datetime | None = None
    instrument: InstrumentType | list[InstrumentType] | None = None

    @field_serializer("instrument")
    def serialize_instrument(self, instrument):
        """Ensure InstrumentType is serialized as a string (or list of strings)."""
        if instrument is None:
            return None
        if isinstance(instrument, list):
            return [entry.value for entry in instrument]
        return instrument.value

    @field_serializer("time")
    def serialize_time(self, time):
        """Ensure datetime is formatted properly in YAML."""
        if not time:
            return None
        return time.strftime("%Y-%m-%d %H:%M:%S")

This comment was marked as outdated.

Copy link
Collaborator

@VeckoTheGecko VeckoTheGecko Feb 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i see why this was done now - that InstrumentType serialization wasn't happening as expected. Ok, let's keep this (though still not sure about the time serialization as that looks to be ok? reverting that part for the timebeing, but can introduce in future PR)

131 changes: 131 additions & 0 deletions src/virtualship/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from importlib.resources import files
from typing import TextIO

import pandas as pd
import yaml
from pydantic import BaseModel

Expand Down Expand Up @@ -37,3 +38,133 @@ def _dump_yaml(model: BaseModel, stream: TextIO) -> str | None:
def _generic_load_yaml(data: str, model: BaseModel) -> BaseModel:
    """Load a yaml string into a pydantic model.

    Parameters
    ----------
    - data (str): YAML document as a string.
    - model (BaseModel): Pydantic model class used to validate the parsed data.

    Returns a validated instance of ``model``.
    """
    # safe_load refuses arbitrary Python-object tags, so untrusted YAML
    # cannot execute code during parsing.
    return model.model_validate(yaml.safe_load(data))


def mfp_to_yaml(excel_file_path: str, yaml_output_path: str):
    """
    Generate a YAML schedule with spatial and temporal information from an MFP Excel export.

    Parameters
    ----------
    - excel_file_path (str): Path to the Excel file containing coordinate and instrument data.
    - yaml_output_path (str): Path the generated schedule YAML file is written to.

    The function:
    1. Reads instrument and location data from the Excel file.
    2. Determines the maximum depth and buffer based on the instruments present.
    3. Computes a spatial range around the waypoints, widened by the buffer.
    4. Writes the resulting schedule to ``yaml_output_path``.

    Raises
    ------
    ValueError
        If the Excel file does not contain the expected MFP columns.
    """
    # Imported locally to avoid a circular import between utils and expedition.
    from virtualship.expedition.instrument_type import InstrumentType
    from virtualship.expedition.schedule import Schedule
    from virtualship.expedition.space_time_region import (
        SpaceTimeRegion,
        SpatialRange,
        TimeRange,
    )
    from virtualship.expedition.waypoint import Location, Waypoint

    # Column headers expected in an MFP "Export Coordinates > DD" file.
    expected_columns = {"Station Type", "Name", "Latitude", "Longitude", "Instrument"}

    # Read data from Excel
    coordinates_data = pd.read_excel(excel_file_path)
    actual_columns = set(coordinates_data.columns)

    # Fail fast if any required column is missing.
    missing_columns = expected_columns - actual_columns
    if missing_columns:
        raise ValueError(
            f"Error: Found columns {list(actual_columns)}, but expected columns {list(expected_columns)}. "
            "Are you sure that you're using the correct export from MFP?"
        )

    # Extra columns are harmless but may indicate an upstream format change.
    extra_columns = actual_columns - expected_columns
    if extra_columns:
        print(
            f"Warning: Found additional unexpected columns {list(extra_columns)}. "
            "Manually added columns have no effect. "
            "If the MFP export format changed, please submit an issue: "
            "https://github.com/OceanParcels/virtualship/issues."
        )

    # Keep only the expected columns and drop incomplete rows.
    coordinates_data = coordinates_data[list(expected_columns)]
    coordinates_data = coordinates_data.dropna()

    # Maximum depth (in meters) and buffer (in degrees) for each instrument.
    instrument_properties = {
        "XBT": {"maximum_depth": 2000, "buffer": 1},
        "CTD": {"maximum_depth": 5000, "buffer": 1},
        "DRIFTER": {"maximum_depth": 1, "buffer": 5},
        "ARGO_FLOAT": {"maximum_depth": 2000, "buffer": 5},
    }

    # Collect the set of unique instruments used across all waypoints.
    # Cells contain comma-separated lists, e.g. "CTD, DRIFTER".
    unique_instruments = set()
    for instrument_list in coordinates_data["Instrument"]:
        unique_instruments |= set(instrument_list.split(", "))

    # Depth range and buffer are driven by the most demanding instrument;
    # default=0 keeps an empty instrument set from raising ValueError.
    minimum_depth = 0
    maximum_depth = max(
        (
            instrument_properties.get(inst, {"maximum_depth": 0})["maximum_depth"]
            for inst in unique_instruments
        ),
        default=0,
    )
    buffer = max(
        (
            instrument_properties.get(inst, {"buffer": 0})["buffer"]
            for inst in unique_instruments
        ),
        default=0,
    )

    # Spatial range around the waypoints, widened by the instrument buffer.
    spatial_range = SpatialRange(
        minimum_longitude=coordinates_data["Longitude"].min() - buffer,
        maximum_longitude=coordinates_data["Longitude"].max() + buffer,
        minimum_latitude=coordinates_data["Latitude"].min() - buffer,
        maximum_latitude=coordinates_data["Latitude"].max() + buffer,
        minimum_depth=minimum_depth,
        maximum_depth=maximum_depth,
    )

    # Times are intentionally left unset; the user fills them in afterwards.
    space_time_region = SpaceTimeRegion(
        spatial_range=spatial_range,
        time_range=TimeRange(),
    )

    # One waypoint per row; instruments are stored as a list of enums.
    waypoints = [
        Waypoint(
            instrument=[
                InstrumentType(instrument)
                for instrument in row["Instrument"].split(", ")
            ],
            location=Location(latitude=row["Latitude"], longitude=row["Longitude"]),
        )
        for _, row in coordinates_data.iterrows()
    ]

    schedule = Schedule(
        waypoints=waypoints,
        space_time_region=space_time_region,
    )

    # Save to YAML file
    schedule.to_yaml(yaml_output_path)
102 changes: 102 additions & 0 deletions tests/test_mfp_to_yaml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from unittest.mock import patch

import pandas as pd
import pytest

from virtualship.expedition.instrument_type import InstrumentType
from virtualship.expedition.schedule import Schedule
from virtualship.utils import mfp_to_yaml

# Sample correct MFP data: all five required columns, one row per station,
# with comma-separated instrument lists as produced by the MFP export.
VALID_MFP_DATA = pd.DataFrame(
    {
        "Station Type": ["A", "B", "C"],
        "Name": ["Station1", "Station2", "Station3"],
        "Latitude": [30, 31, 32],
        "Longitude": [-44, -45, -46],
        "Instrument": ["CTD, DRIFTER", "ARGO_FLOAT", "XBT, CTD, DRIFTER"],
    }
)

# Missing required columns ("Longitude" and "Instrument" are absent).
MISSING_HEADERS_DATA = pd.DataFrame(
    {"Station Type": ["A"], "Name": ["Station1"], "Latitude": [10.5]}
)

# Extra unexpected columns on top of an otherwise valid export.
EXTRA_HEADERS_DATA = VALID_MFP_DATA.copy()
EXTRA_HEADERS_DATA["Unexpected Column"] = ["Extra1", "Extra2", "Extra3"]


@patch("pandas.read_excel", return_value=VALID_MFP_DATA)
def test_mfp_to_yaml_success(mock_read_excel, tmp_path):
    """Test that mfp_to_yaml correctly processes a valid MFP Excel file."""
    output_path = tmp_path / "schedule.yaml"

    # pandas.read_excel is patched, so the xlsx filename is never opened.
    mfp_to_yaml("mock_file.xlsx", output_path)

    # The schedule file must have been written to disk.
    assert output_path.exists()

    # Round-trip through the Schedule model and verify the waypoints.
    schedule = Schedule.from_yaml(output_path)

    expected_instruments = [
        [InstrumentType.CTD, InstrumentType.DRIFTER],
        [InstrumentType.ARGO_FLOAT],
        [InstrumentType.XBT, InstrumentType.CTD, InstrumentType.DRIFTER],
    ]
    assert len(schedule.waypoints) == 3
    for waypoint, instruments in zip(schedule.waypoints, expected_instruments):
        assert waypoint.instrument == instruments


@patch("pandas.read_excel", return_value=MISSING_HEADERS_DATA)
def test_mfp_to_yaml_missing_headers(mock_read_excel, tmp_path):
    """Test that mfp_to_yaml raises an error when required columns are missing."""
    output_path = tmp_path / "schedule.yaml"

    # The column-validation error message is part of the public behavior.
    expected_message = "Error: Found columns .* but expected columns .*"
    with pytest.raises(ValueError, match=expected_message):
        mfp_to_yaml("mock_file.xlsx", output_path)


@patch("pandas.read_excel", return_value=EXTRA_HEADERS_DATA)
@patch("builtins.print")  # Capture printed warnings
def test_mfp_to_yaml_extra_headers(mock_print, mock_read_excel, tmp_path):
    """Test that mfp_to_yaml prints a warning when extra columns are found."""
    output_path = tmp_path / "schedule.yaml"

    mfp_to_yaml("mock_file.xlsx", output_path)

    # The exact warning text must have been printed at least once.
    expected_warning = (
        "Warning: Found additional unexpected columns ['Unexpected Column']. "
        "Manually added columns have no effect. "
        "If the MFP export format changed, please submit an issue: "
        "https://github.com/OceanParcels/virtualship/issues."
    )
    mock_print.assert_any_call(expected_warning)


@patch("pandas.read_excel", return_value=VALID_MFP_DATA)
def test_mfp_to_yaml_instrument_conversion(mock_read_excel, tmp_path):
    """Test that instruments are correctly converted into InstrumentType enums."""
    output_path = tmp_path / "schedule.yaml"

    mfp_to_yaml("mock_file.xlsx", output_path)

    # Reload the generated YAML through the Schedule model.
    schedule = Schedule.from_yaml(output_path)

    first = schedule.waypoints[0]
    second = schedule.waypoints[1]
    third = schedule.waypoints[2]
    assert isinstance(first.instrument, list)
    assert first.instrument == [InstrumentType.CTD, InstrumentType.DRIFTER]
    assert second.instrument == [InstrumentType.ARGO_FLOAT]
    assert third.instrument == [
        InstrumentType.XBT,
        InstrumentType.CTD,
        InstrumentType.DRIFTER,
    ]
Loading