Skip to content

Commit 56d41e6

Browse files
committed
update pre-download ingestion methods to take files split by time
1 parent e364d00 commit 56d41e6

File tree

3 files changed

+60
-55
lines changed

3 files changed

+60
-55
lines changed

src/virtualship/instruments/base.py

Lines changed: 55 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import abc
2+
import re
23
from collections import OrderedDict
3-
from datetime import timedelta
4+
from datetime import datetime, timedelta
45
from pathlib import Path
56
from typing import TYPE_CHECKING
67

@@ -156,64 +157,46 @@ def _get_copernicus_ds(
156157
coordinates_selection_method="outside",
157158
)
158159

159-
def _load_local_ds(self, filename) -> xr.Dataset:
160-
"""
161-
Load local dataset from specified data directory.
162-
163-
Sliced according to expedition.schedule.space_time_region and buffer specs.
164-
"""
165-
ds = xr.open_dataset(self.from_data.joinpath(filename))
166-
167-
coord_rename = {}
168-
if "lat" in ds.coords:
169-
coord_rename["lat"] = "latitude"
170-
if "lon" in ds.coords:
171-
coord_rename["lon"] = "longitude"
172-
if coord_rename:
173-
ds = ds.rename(coord_rename)
174-
175-
min_lon = (
176-
self.expedition.schedule.space_time_region.spatial_range.minimum_longitude
177-
- self._get_spec_value(
178-
"buffer", "latlon", 3.0
179-
) # always add min 3 deg buffer for local data to avoid edge issues with ds.sel()
180-
)
181-
max_lon = (
182-
self.expedition.schedule.space_time_region.spatial_range.maximum_longitude
183-
+ self._get_spec_value("buffer", "latlon", 3.0)
184-
)
185-
min_lat = (
186-
self.expedition.schedule.space_time_region.spatial_range.minimum_latitude
187-
- self._get_spec_value("buffer", "latlon", 3.0)
188-
)
189-
max_lat = (
190-
self.expedition.schedule.space_time_region.spatial_range.maximum_latitude
191-
+ self._get_spec_value("buffer", "latlon", 3.0)
192-
)
193-
194-
return ds.sel(
195-
latitude=slice(min_lat, max_lat),
196-
longitude=slice(min_lon, max_lon),
197-
)
198-
199160
def _generate_fieldset(self) -> FieldSet:
200161
"""
201162
Create and combine FieldSets for each variable, supporting both local and Copernicus Marine data sources.
202163
203-
Avoids issues when using copernicusmarine and creating directly one FieldSet of ds's sourced from different Copernicus Marine product IDs, which is often the case for BGC variables.
164+
Building one FieldSet per variable avoids issues that arise with copernicusmarine when a single FieldSet is created directly from datasets sourced from different Copernicus Marine product IDs, which is often the case for BGC variables.
204165
"""
205166
fieldsets_list = []
206167
keys = list(self.variables.keys())
207168

208169
for key in keys:
209170
var = self.variables[key]
210171
if self.from_data is not None: # load from local data
211-
filename, full_var_name = _find_nc_file_with_variable(
212-
self.from_data, var
172+
physical = var in COPERNICUSMARINE_PHYS_VARIABLES
173+
if physical:
174+
data_dir = self.from_data.joinpath("phys")
175+
else:
176+
data_dir = self.from_data.joinpath("bgc")
177+
178+
schedule_start = (
179+
self.expedition.schedule.space_time_region.time_range.start_time
213180
)
214-
ds = self._load_local_ds(filename)
215-
fs = FieldSet.from_xarray_dataset(
216-
ds, {key: full_var_name}, self.dimensions, mesh="spherical"
181+
schedule_end = (
182+
self.expedition.schedule.space_time_region.time_range.end_time
183+
)
184+
185+
files = self._find_files_in_timerange(
186+
data_dir,
187+
schedule_start,
188+
schedule_end,
189+
)
190+
191+
_, full_var_name = _find_nc_file_with_variable(
192+
data_dir, var
193+
) # get full variable name from one of the files; var may only appear as substring in variable name in file
194+
195+
fs = FieldSet.from_netcdf(
196+
filenames=[data_dir.joinpath(f) for f in files],
197+
variables={key: full_var_name},
198+
dimensions=self.dimensions,
199+
mesh="spherical",
217200
)
218201
else: # stream via Copernicus Marine
219202
physical = var in COPERNICUSMARINE_PHYS_VARIABLES
@@ -233,3 +216,28 @@ def _get_spec_value(self, spec_type: str, key: str, default=None):
233216
"""Helper to extract a value from buffer_spec or limit_spec."""
234217
spec = self.buffer_spec if spec_type == "buffer" else self.limit_spec
235218
return spec.get(key) if spec and spec.get(key) is not None else default
219+
220+
def _find_files_in_timerange(
    self,
    data_dir: Path,
    schedule_start,
    schedule_end,
    date_pattern=r"\d{4}_\d{2}_\d{2}",
    date_fmt="%Y_%m_%d",
) -> list:
    """
    Find all files in data_dir whose filenames contain a date within [schedule_start, schedule_end] (inclusive).

    Only regular files directly inside ``data_dir`` are considered. The first
    substring of each filename matching ``date_pattern`` is parsed with
    ``date_fmt``; files with no match, or whose match is not a valid calendar
    date, are skipped.

    Parameters
    ----------
    data_dir : Path
        Directory to scan (not searched recursively).
    schedule_start, schedule_end : datetime-like
        Bounds of the schedule; only their ``.date()`` is used, so the
        time-of-day component never excludes a file dated on a boundary day.
    date_pattern : str
        Regular expression locating the date substring in a filename.
    date_fmt : str
        ``strptime`` format used to parse the matched substring.

    Returns
    -------
    list
        Filenames (names only, not full paths) sorted by extracted date, with
        ties broken by filename so the result is deterministic.
    """
    # TODO: scope to make this more flexible for different date patterns / formats ...
    # normalise to date only for comparison (start/end carry hour/minute
    # components, whereas the dates embedded in filenames do not)
    start_date = schedule_start.date()
    end_date = schedule_end.date()
    pattern = re.compile(date_pattern)  # compile once rather than per file

    files_with_dates = []
    for file in data_dir.iterdir():
        if not file.is_file():
            continue
        match = pattern.search(file.name)
        if match is None:
            continue
        try:
            file_date = datetime.strptime(match.group(), date_fmt).date()
        except ValueError:
            # digits matched the pattern but are not a real date
            # (e.g. "2023_13_45"); ignore the file instead of aborting the scan
            continue
        if start_date <= file_date <= end_date:
            files_with_dates.append((file_date, file.name))

    # sort by (date, filename): date order is what matters, and the filename
    # tie-break avoids depending on arbitrary filesystem iteration order
    files_with_dates.sort()
    return [fname for _, fname in files_with_dates]

src/virtualship/models/expedition.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import itertools
44
from datetime import datetime, timedelta
55
from pathlib import Path
6-
from typing import TYPE_CHECKING
76

87
import numpy as np
98
import pydantic
@@ -17,10 +16,6 @@
1716
from .location import Location
1817
from .space_time_region import SpaceTimeRegion
1918

20-
if TYPE_CHECKING:
21-
pass
22-
23-
2419
projection: pyproj.Geod = pyproj.Geod(ellps="WGS84")
2520

2621

src/virtualship/utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -419,13 +419,15 @@ def _get_bathy_data(
419419
"""Load bathymetry data from local files or 'streamed' directly from Copernicus Marine."""
420420
if from_data is not None: # load from local data
421421
var = "deptho"
422+
bathy_dir = from_data.joinpath("bathymetry/")
422423
try:
423-
filename, _ = _find_nc_file_with_variable(from_data, var)
424+
filename, _ = _find_nc_file_with_variable(bathy_dir, var)
424425
except Exception as e:
426+
# TODO: link to documentation on expected data structure!!
425427
raise RuntimeError(
426-
f"Could not find bathymetry variable '{var}' in provided data directory '{from_data}'."
428+
f"\n\n❗️ Could not find bathymetry variable '{var}' in data directory '{from_data}/bathymetry/'.\n\n❗️ Is the pre-downloaded data directory structure compliant with VirtualShip expectations?\n\n❗️ See for more information on expectations: <<<INSERT LINK TO DOCS>>>\n"
427429
) from e
428-
ds_bathymetry = xr.open_dataset(from_data.joinpath(filename))
430+
ds_bathymetry = xr.open_dataset(bathy_dir.joinpath(filename))
429431
bathymetry_variables = {"bathymetry": "deptho"}
430432
bathymetry_dimensions = {"lon": "longitude", "lat": "latitude"}
431433
return FieldSet.from_xarray_dataset(

0 commit comments

Comments
 (0)