Skip to content

Commit eb594f1

Browse files
committed
fix kerchunk JSON file filtering
- refactor to deduplicate code
- simplify how dates are passed around for the kerchunk functions
- and this way fix the date format mismatch when filtering the JSON files
1 parent 50afc3d commit eb594f1

File tree

3 files changed

+55
-89
lines changed

3 files changed

+55
-89
lines changed

docs/whats_new.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# What's New
22

3+
## Unreleased
4+
5+
* Fix kerchunk JSON file filtering.
6+
37
## v0.12.1 (April 8, 2025)
48

59
* Correction to `interpolator_filename` handling and improvement in testing.

particle_tracking_manager/config_ocean_model.py

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -175,21 +175,7 @@ def create_ocean_model_simulation(
175175
)
176176

177177

178-
def get_file_date_string(name: str, date: datetime) -> str:
179-
"""Get the file date string for the given ocean model name and date."""
180-
if name == "NWGOA":
181-
return f"{date.year}-{str(date.month).zfill(2)}-{str(date.day).zfill(2)}"
182-
elif name == "CIOFSOP":
183-
return f"{date.year}-{str(date.month).zfill(2)}-{str(date.day).zfill(2)}"
184-
elif name == "CIOFS":
185-
return f"{date.year}_{str(date.timetuple().tm_yday - 1).zfill(4)}"
186-
elif name == "CIOFSFRESH":
187-
return f"{date.year}_{str(date.timetuple().tm_yday - 1).zfill(4)}"
188-
else:
189-
raise ValueError(f"get_file_date_string not implemented for {name}.")
190-
191-
192-
function_map: dict[str, Callable[[str, str, str], dict[Any, Any]]] = {
178+
function_map: dict[str, Callable[[datetime, datetime, str], dict[Any, Any]]] = {
193179
"make_nwgoa_kerchunk": make_nwgoa_kerchunk,
194180
"make_ciofs_kerchunk": make_ciofs_kerchunk,
195181
}
@@ -208,9 +194,7 @@ def loc_local(
208194
start_time = start_sim + timedelta(days=1)
209195
end_time = end_sim - timedelta(days=1)
210196

211-
start = get_file_date_string(name, start_time)
212-
end = get_file_date_string(name, end_time)
213-
return function_map[kerchunk_func_str](start, end, name)
197+
return function_map[kerchunk_func_str](start_time, end_time, name)
214198

215199

216200
def register_on_the_fly(ds_info: dict, ocean_model: str = "ONTHEFLY") -> None:

particle_tracking_manager/models/opendrift/utils.py

Lines changed: 49 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# Standard library imports
77
from datetime import datetime
88
from pathlib import Path
9+
from typing import Callable, Iterable, List
910

1011
# Third-party imports
1112
import fsspec
@@ -96,15 +97,47 @@ def apply_user_input_ocean_model_specific_changes(
9697
return ds
9798

9899

99-
def make_ciofs_kerchunk(start: str, end: str, name: str) -> dict:
100+
def find_json_files_in_date_range(
    fs2,
    make_glob_from_year: Callable[[str], str],
    start: datetime,
    end: datetime,
    filename_date_format: str,
) -> List[str]:
    """Find JSON files whose file-name date lies in a date range.

    Parameters
    ----------
    fs2
        Filesystem-like object; only its ``glob(pattern)`` method is used.
    make_glob_from_year : Callable[[str], str]
        Builds the glob pattern for one year, given as ``strftime("%Y")``
        text.
    start, end : datetime
        Bounds of the range, both inclusive. ``end`` may precede ``start``
        (going backward in time); the bounds are then swapped.
    filename_date_format : str
        ``strptime`` format matching the stem of each file name, e.g.
        ``"nwgoa_%Y-%m-%d"`` or ``"%Y_0%j"``.

    Returns
    -------
    List[str]
        Matching paths in glob order: the files of ``start``'s year first,
        then those of ``end``'s year if it differs.

    Raises
    ------
    ValueError
        If the years of ``start`` and ``end`` differ by more than one, or if
        the stem of a globbed file does not match ``filename_date_format``.

    Notes
    -----
    The comparison is made on calendar days. A file stem parsed with a
    date-only format yields midnight, so comparing it against a bound that
    carries a time of day would drop the file of the earliest day.
    """
    if abs(start.year - end.year) > 1:
        raise ValueError(
            f"Start ({start.year}) and end ({end.year}) "
            "dates must be at most 1 year apart."
        )

    # only glob start and end year files, order isn't important
    start_year = start.strftime("%Y")
    end_year = end.strftime("%Y")
    # copy, so the list returned by the filesystem is never extended in place
    json_list = list(fs2.glob(make_glob_from_year(start_year)))
    if end_year != start_year:
        json_list += fs2.glob(make_glob_from_year(end_year))

    # if going backward in time the bounds arrive reversed; order them once
    first_day, last_day = sorted((start.date(), end.date()))

    return [
        pth
        for pth in json_list
        if first_day
        <= datetime.strptime(Path(pth).stem, filename_date_format).date()
        <= last_day
    ]
132+
133+
134+
def make_ciofs_kerchunk(start: datetime, end: datetime, name: str) -> dict:
100135
"""_summary_
101136
102137
Parameters
103138
----------
104-
start, end : str
105-
Should be something like "2004_0001" for YYYY_0DDD where DDD is dayofyear
106-
to match the files in the directory, which are by year and day of year
107-
("ciofs_fresh" or "ciofs") or "YYYY-MM-DD" for "aws_ciofs"
139+
start, end : datetime
140+
Start and end time of the simulation.
108141
109142
Returns
110143
-------
@@ -123,50 +156,21 @@ def make_ciofs_kerchunk(start: str, end: str, name: str) -> dict:
123156

124157
fs2 = fsspec.filesystem("") # local file system to save final jsons to
125158

126-
if name in ["CIOFS", "CIOFSFRESH"]:
127-
159+
if name == "CIOFSOP":
128160
# base for matching
129161
def base_str(a_time: str) -> str:
130-
return f"{output_dir_single_files}/{a_time}_*.json"
131-
132-
date_format = "%Y_0%j"
162+
return f"{output_dir_single_files}/ciofs_{a_time}-*.json"
133163

134-
elif name == "CIOFSOP":
164+
date_format = "ciofs_%Y-%m-%d"
135165

166+
else: # name is "CIOFS" or "CIOFSFRESH"
136167
# base for matching
137168
def base_str(a_time: str) -> str:
138-
return f"{output_dir_single_files}/ciofs_{a_time}-*.json"
139-
140-
date_format = "ciofs_%Y-%m-%d"
141-
else:
142-
raise ValueError(f"Name {name} not recognized")
169+
return f"{output_dir_single_files}/{a_time}_*.json"
143170

144-
# only glob start and end year files, order isn't important
145-
json_list = fs2.glob(base_str(start[:4]))
146-
if end[:4] != start[:4]:
147-
json_list += fs2.glob(base_str(end[:4]))
148-
149-
# forward in time
150-
if end > start:
151-
json_list = [
152-
j
153-
for j in json_list
154-
if datetime.strptime(Path(j).stem, date_format).isoformat() >= start
155-
and datetime.strptime(Path(j).stem, date_format).isoformat() <= end
156-
]
157-
# backward in time
158-
elif end < start:
159-
json_list = [
160-
j
161-
for j in json_list
162-
if datetime.strptime(Path(j).stem, date_format).isoformat() <= start
163-
and datetime.strptime(Path(j).stem, date_format).isoformat() >= end
164-
]
171+
date_format = "%Y_0%j"
165172

166-
if json_list == []:
167-
raise ValueError(
168-
f"No files found in {output_dir_single_files} for {start} to {end}"
169-
)
173+
json_list = find_json_files_in_date_range(fs2, base_str, start, end, date_format)
170174

171175
# Multi-file JSONs
172176
# This code uses the output generated above to create a single ensemble dataset,
@@ -280,13 +284,13 @@ def postprocess(out: dict) -> dict:
280284
return out
281285

282286

283-
def make_nwgoa_kerchunk(start: str, end: str, name: str = "NWGOA") -> dict:
287+
def make_nwgoa_kerchunk(start: datetime, end: datetime, name: str = "NWGOA") -> dict:
284288
"""_summary_
285289
286290
Parameters
287291
----------
288-
start, end : str
289-
Should be something like "1999-01-02" for YYYY-MM-DD
292+
start, end : datetime
293+
Start and end time of the simulation.
290294
291295
Returns
292296
-------
@@ -308,33 +312,7 @@ def base_str(a_time: str) -> str:
308312

309313
date_format = "nwgoa_%Y-%m-%d"
310314

311-
# only glob start and end year files, order isn't important
312-
json_list = fs2.glob(base_str(start[:4]))
313-
314-
if end[:4] != start[:4]:
315-
json_list += fs2.glob(base_str(end[:4]))
316-
317-
# forward in time
318-
if end > start:
319-
json_list = [
320-
j
321-
for j in json_list
322-
if datetime.strptime(Path(j).stem, date_format).isoformat() >= start
323-
and datetime.strptime(Path(j).stem, date_format).isoformat() <= end
324-
]
325-
# backward in time
326-
elif end < start:
327-
json_list = [
328-
j
329-
for j in json_list
330-
if datetime.strptime(Path(j).stem, date_format).isoformat() <= start
331-
and datetime.strptime(Path(j).stem, date_format).isoformat() >= end
332-
]
333-
334-
if json_list == []:
335-
raise ValueError(
336-
f"No files found in {output_dir_single_files} for {start} to {end}"
337-
)
315+
json_list = find_json_files_in_date_range(fs2, base_str, start, end, date_format)
338316

339317
# account for double compression
340318
# Look at individual variables in the files to see what needs to be changed with

0 commit comments

Comments
 (0)