Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/whats_new.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# What's New

## Unreleased

* Fix kerchunk JSON file filtering.

## v0.12.2 (April 15, 2025)

* Fixed an issue where setting `vertical_mixing` to False for `OpenOil` was not passed through correctly.
Expand Down
20 changes: 2 additions & 18 deletions particle_tracking_manager/config_ocean_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,21 +175,7 @@ def create_ocean_model_simulation(
)


def get_file_date_string(name: str, date: datetime) -> str:
    """Return the date stamp used in file names for a given ocean model.

    Parameters
    ----------
    name : str
        Ocean model name. "NWGOA" and "CIOFSOP" produce ``YYYY-MM-DD``;
        "CIOFS" and "CIOFSFRESH" produce ``YYYY_DDDD`` where ``DDDD`` is the
        day of year minus one, zero-padded to four digits.
    date : datetime
        Date to convert.

    Returns
    -------
    str
        The formatted date string.

    Raises
    ------
    ValueError
        If ``name`` is not one of the four model names handled here.
    """
    # Models whose file names carry a calendar date.
    if name in ("NWGOA", "CIOFSOP"):
        return f"{date.year}-{date.month:02d}-{date.day:02d}"

    # Models whose file names carry a year plus a shifted day-of-year.
    if name in ("CIOFS", "CIOFSFRESH"):
        shifted_day_of_year = date.timetuple().tm_yday - 1
        return f"{date.year}_{shifted_day_of_year:04d}"

    raise ValueError(f"get_file_date_string not implemented for {name}.")


function_map: dict[str, Callable[[str, str, str], dict[Any, Any]]] = {
function_map: dict[str, Callable[[datetime, datetime, str], dict[Any, Any]]] = {
"make_nwgoa_kerchunk": make_nwgoa_kerchunk,
"make_ciofs_kerchunk": make_ciofs_kerchunk,
}
Expand All @@ -208,9 +194,7 @@ def loc_local(
start_time = start_sim + timedelta(days=1)
end_time = end_sim - timedelta(days=1)

start = get_file_date_string(name, start_time)
end = get_file_date_string(name, end_time)
return function_map[kerchunk_func_str](start, end, name)
return function_map[kerchunk_func_str](start_time, end_time, name)


def register_on_the_fly(ds_info: dict, ocean_model: str = "ONTHEFLY") -> None:
Expand Down
120 changes: 49 additions & 71 deletions particle_tracking_manager/models/opendrift/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# Standard library imports
from datetime import datetime
from pathlib import Path
from typing import Callable, Iterable, List

# Third-party imports
import fsspec
Expand Down Expand Up @@ -96,15 +97,47 @@ def apply_user_input_ocean_model_specific_changes(
return ds


def make_ciofs_kerchunk(start: str, end: str, name: str) -> dict:
def find_json_files_in_date_range(
    fs2,
    make_glob_from_year: Callable[[str], str],
    start: datetime,
    end: datetime,
    filename_date_format: str,
) -> List[str]:
    """Return the JSON file paths whose file-name date lies between two dates.

    Parameters
    ----------
    fs2
        Object exposing a ``glob(pattern)`` method that returns a list of paths.
    make_glob_from_year : Callable[[str], str]
        Builds the glob pattern to search for a given four-digit year string.
    start, end : datetime
        Bounds of the date range (inclusive). ``end`` may be earlier than
        ``start``; the bounds are ordered before filtering.
    filename_date_format : str
        ``strptime`` format that the stem of each file name must match.

    Returns
    -------
    List[str]
        Matching paths, in the order returned by the glob calls (start year
        first, then end year if different).

    Raises
    ------
    ValueError
        If the start and end years differ by more than one, or if a globbed
        file stem does not match ``filename_date_format``.
    """
    # Only the start-year and end-year files are globbed, so any year lying
    # strictly between them would be missed; reject such ranges up front.
    year_gap = abs(start.year - end.year)
    if year_gap > 1:
        raise ValueError(
            f"Start ({start.year}) and end ({end.year}) "
            "dates must be at most 1 year apart."
        )

    first_year = start.strftime("%Y")
    last_year = end.strftime("%Y")
    candidates = fs2.glob(make_glob_from_year(first_year))
    if last_year != first_year:
        candidates += fs2.glob(make_glob_from_year(last_year))

    # A backward-in-time range is handled by ordering the bounds.
    earliest = min(start, end)
    latest = max(start, end)

    selected = []
    for candidate in candidates:
        file_date = datetime.strptime(Path(candidate).stem, filename_date_format)
        if earliest <= file_date <= latest:
            selected.append(candidate)
    return selected


def make_ciofs_kerchunk(start: datetime, end: datetime, name: str) -> dict:
"""_summary_

Parameters
----------
start, end : str
Should be something like "2004_0001" for YYYY_0DDD where DDD is dayofyear
to match the files in the directory, which are by year and day of year
("ciofs_fresh" or "ciofs") or "YYYY-MM-DD" for "aws_ciofs"
start, end : datetime
Start and end time of the simulation.

Returns
-------
Expand All @@ -123,50 +156,21 @@ def make_ciofs_kerchunk(start: str, end: str, name: str) -> dict:

fs2 = fsspec.filesystem("") # local file system to save final jsons to

if name in ["CIOFS", "CIOFSFRESH"]:

if name == "CIOFSOP":
# base for matching
def base_str(a_time: str) -> str:
return f"{output_dir_single_files}/{a_time}_*.json"

date_format = "%Y_0%j"
return f"{output_dir_single_files}/ciofs_{a_time}-*.json"

elif name == "CIOFSOP":
date_format = "ciofs_%Y-%m-%d"

else: # name is "CIOFS" or "CIOFSFRESH"
# base for matching
def base_str(a_time: str) -> str:
return f"{output_dir_single_files}/ciofs_{a_time}-*.json"
return f"{output_dir_single_files}/{a_time}_*.json"

date_format = "ciofs_%Y-%m-%d"
else:
raise ValueError(f"Name {name} not recognized")
date_format = "%Y_0%j"

# only glob start and end year files, order isn't important
json_list = fs2.glob(base_str(start[:4]))
if end[:4] != start[:4]:
json_list += fs2.glob(base_str(end[:4]))

# forward in time
if end > start:
json_list = [
j
for j in json_list
if datetime.strptime(Path(j).stem, date_format).isoformat() >= start
and datetime.strptime(Path(j).stem, date_format).isoformat() <= end
]
# backward in time
elif end < start:
json_list = [
j
for j in json_list
if datetime.strptime(Path(j).stem, date_format).isoformat() <= start
and datetime.strptime(Path(j).stem, date_format).isoformat() >= end
]

if json_list == []:
raise ValueError(
f"No files found in {output_dir_single_files} for {start} to {end}"
)
json_list = find_json_files_in_date_range(fs2, base_str, start, end, date_format)

# Multi-file JSONs
# This code uses the output generated above to create a single ensemble dataset,
Expand Down Expand Up @@ -280,13 +284,13 @@ def postprocess(out: dict) -> dict:
return out


def make_nwgoa_kerchunk(start: str, end: str, name: str = "NWGOA") -> dict:
def make_nwgoa_kerchunk(start: datetime, end: datetime, name: str = "NWGOA") -> dict:
"""_summary_

Parameters
----------
start, end : str
Should be something like "1999-01-02" for YYYY-MM-DD
start, end : datetime
Start and end time of the simulation.

Returns
-------
Expand All @@ -308,33 +312,7 @@ def base_str(a_time: str) -> str:

date_format = "nwgoa_%Y-%m-%d"

# only glob start and end year files, order isn't important
json_list = fs2.glob(base_str(start[:4]))

if end[:4] != start[:4]:
json_list += fs2.glob(base_str(end[:4]))

# forward in time
if end > start:
json_list = [
j
for j in json_list
if datetime.strptime(Path(j).stem, date_format).isoformat() >= start
and datetime.strptime(Path(j).stem, date_format).isoformat() <= end
]
# backward in time
elif end < start:
json_list = [
j
for j in json_list
if datetime.strptime(Path(j).stem, date_format).isoformat() <= start
and datetime.strptime(Path(j).stem, date_format).isoformat() >= end
]

if json_list == []:
raise ValueError(
f"No files found in {output_dir_single_files} for {start} to {end}"
)
json_list = find_json_files_in_date_range(fs2, base_str, start, end, date_format)

# account for double compression
# Look at individual variables in the files to see what needs to be changed with
Expand Down
89 changes: 89 additions & 0 deletions tests/test_config_ocean_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
import particle_tracking_manager

from particle_tracking_manager.config_ocean_model import ocean_model_simulation_mapper
from particle_tracking_manager.models.opendrift.utils import (
find_json_files_in_date_range,
)
from particle_tracking_manager.ocean_model_registry import ocean_model_registry


Expand Down Expand Up @@ -188,6 +191,92 @@ def test_start_end_times():
)


class MockFileSystem:
    """Stand-in file system that fakes globbing of kerchunk JSON files."""

    def __init__(self, files):
        # Full list of file names this fake file system "contains".
        self.files = files

    def glob(self, year):
        """Return the stored files whose name contains ``year``.

        Unlike a real ``glob``, the argument is a plain year string that is
        matched as a substring, not a glob pattern.
        """
        return list(filter(lambda path: year in path, self.files))


# (month, day) pairs for which a mock file exists in every mocked year.
MOCK_FILES_YEARLY_DATES = [
    (1, 1),
    (1, 2),
    (1, 31),
    (3, 15),
    (5, 31),
    (6, 1),
    (6, 15),
    (6, 30),
    (7, 1),
    (9, 15),
    (12, 1),
    (12, 30),
    (12, 31),
]
# One datetime per (month, day) pair above for each of 2019, 2020 and 2021.
MOCK_FILES_DATES = [
    datetime(year, month, day)
    for month, day in MOCK_FILES_YEARLY_DATES
    for year in [2019, 2020, 2021]
]
# Each entry is (start, end, expected): start and end are (year, month, day)
# tuples, and expected is the number of mock files that should fall inside the
# inclusive range. An expected value of None marks a range for which the test
# expects a ValueError. Some entries have start later than end to cover
# backward-in-time ranges.
TEST_DATE_RANGES = [
    ((2020, 6, 5), (2020, 6, 25), 1),
    ((2020, 1, 1), (2020, 1, 31), 3),
    ((2020, 1, 31), (2020, 1, 1), 3),
    ((2019, 12, 1), (2020, 1, 31), 6),
    ((2019, 12, 31), (2020, 1, 1), 2),
    ((2020, 1, 1), (2020, 12, 31), 13),
    ((2020, 1, 3), (2020, 6, 29), 5),
    ((2020, 6, 29), (2020, 1, 3), 5),
    ((2019, 12, 31), (2021, 1, 1), None),
    ((2019, 12, 31), (2021, 12, 31), None),
    ((2020, 1, 1), (2021, 12, 31), 26),
]


@pytest.mark.parametrize(
    "start_tuple, end_tuple, expected",
    TEST_DATE_RANGES,
    ids=[f"{start}->{end}" for start, end, _ in TEST_DATE_RANGES],
)
@pytest.mark.parametrize(
    "filename_format",
    ["%Y_0%j", "ciofs_%Y-%m-%d", "nwgoa_%Y-%m-%d"],
    ids=["CIOFS", "CIOFSOP", "NWGOA"],
)
def test_krchunk_json_filtering(filename_format, start_tuple, end_tuple, expected):
    """Check that kerchunk JSON files are correctly filtered based on date range.

    ``expected`` is the number of files that should be returned, or ``None``
    when the date range should be rejected with a ``ValueError``.
    """
    start = datetime(*start_tuple)
    end = datetime(*end_tuple)

    all_files = [f"{d.strftime(filename_format)}.json" for d in MOCK_FILES_DATES]
    mock_fs = MockFileSystem(all_files)

    def make_glob_with_check(year: str) -> str:
        """Check that the input string is a year and return it instead of any pattern.

        The return value will be used by `MockFileSystem.glob` which accounts for this.
        """
        assert year == f"{start.year:04}" or year == f"{end.year:04}"
        return year

    if expected is None:
        # Only the "range too wide" error counts as the expected failure; an
        # unrelated ValueError (e.g. a file name that does not match the date
        # format) must not make these cases pass.
        with pytest.raises(ValueError, match="at most 1 year apart"):
            find_json_files_in_date_range(
                mock_fs,
                make_glob_with_check,
                start,
                end,
                filename_format,
            )
        return

    # For valid ranges any exception propagates with its full traceback rather
    # than being converted into a generic assertion failure.
    jsons = find_json_files_in_date_range(
        mock_fs,
        make_glob_with_check,
        start,
        end,
        filename_format,
    )
    assert len(jsons) == expected


def test_user_registry():

# Create a temporary directory
Expand Down
Loading