Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

94 add yearly frequency to ddlpymeasurements #95

Merged
merged 6 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ History

UNRELEASED
----------
* allow for different retrieval frequencies (including None) in `ddlpy.measurements()` https://github.com/Deltares/ddlpy/pull/95
* avoid duplicated periods in dataframe returned by `ddlpy.measurements_amount()` https://github.com/Deltares/ddlpy/pull/93

0.4.0 (2024-04-08)
Expand Down
45 changes: 33 additions & 12 deletions ddlpy/ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def _clean_dataframe(measurements):
# drop duplicate rows (preserves e.g. different Grootheden/Groeperingen at same timestep)
measurements = measurements.drop_duplicates()

# remove Tijdstap column, has to be done after drop_duplicates to avoid too much to be dropped
# remove Tijdstip column, has to be done after drop_duplicates to avoid too much to be dropped
measurements = measurements.drop("Tijdstip", axis=1, errors='ignore')

# sort dataframe on time, ddl returns non-sorted data
Expand All @@ -328,8 +328,30 @@ def _clean_dataframe(measurements):
return measurements


def measurements(location, start_date, end_date, clean_df=True):
"""return measurements for the given location and time window (start_date, end_date)"""
def measurements(location, start_date, end_date, freq=dateutil.rrule.MONTHLY, clean_df=True):
"""
Return measurements for the given location and time window (start_date, end_date)

Parameters
----------
location : pd.Series
Single row of the `ddlpy.locations()` DataFrame.
start_date : str, dt.datetime, pd.Timestamp
Start of the retrieval period.
end_date : str, dt.datetime, pd.Timestamp
End of the retrieval period.
freq : None, dateutil.rrule.MONTHLY, dateutil.rrule.YEARLY, etc., optional
The frequency in which to divide the requested period (e.g. yearly or monthly).
Can also be None, in which case the entire dataset will be retrieved at once.
Please note that 10-minute measurements can often not be downloaded in yearly (or larger) chunks
since the DDL limits the responses to 157681 values and several stations have duplicated timesteps.
In that case the query will fail with an error or timeout or just return an empty result (as if there was no data).
In that case, the user should fallback to monthly chunks.
This is significantly slower but it is also much more robust. The default is dateutil.rrule.MONTHLY.
clean_df : bool, optional
Whether to sort the dataframe and remove duplicate rows. The default is True.

"""

if isinstance(location, pd.DataFrame):
raise TypeError("The provided location is a pandas.DataFrame, but should be a pandas.Series, "
Expand All @@ -346,15 +368,14 @@ def measurements(location, start_date, end_date, clean_df=True):
# logger.debug("no data found for this station and time extent")
# return

for (start_date_i, end_date_i) in tqdm.tqdm(
date_series(start_date, end_date, freq=dateutil.rrule.MONTHLY)
):
"""return measurements for station given by locations record \"location\", from start_date through end_date
IMPORTANT: measurements made every 10 minutes will not be downoladed if freq= YEAR.
For instance if many duplicate timesteps are present, it will fail or timeout.
Therefore, Please DO NOT CHANGE THE FREQUENCY TO YEAR. KEEP IT MONTHLY NO MATTER HOW SLOW THE CODE CAN BE!
"""

if freq is None:
date_series_iterator = tqdm.tqdm([(start_date, end_date)])
else:
date_series_iterator = tqdm.tqdm(
date_series(start_date, end_date, freq=freq)
)

for (start_date_i, end_date_i) in date_series_iterator:
try:
measurement = _measurements_slice(
location, start_date=start_date_i, end_date=end_date_i
Expand Down
15 changes: 15 additions & 0 deletions tests/test_ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pytest
import ddlpy
import dateutil


@pytest.fixture
Expand Down Expand Up @@ -55,6 +56,20 @@ def test_measurements(measurements):
assert measurements.shape[0] > 1


def test_measurements_freq_yearly(location, measurements):
start_date = dt.datetime(1953, 1, 1)
end_date = dt.datetime(1953, 4, 1)
measurements_yearly = ddlpy.measurements(location, start_date=start_date, end_date=end_date, freq=dateutil.rrule.YEARLY)
assert measurements.shape == measurements_yearly.shape


def test_measurements_freq_none(location, measurements):
start_date = dt.datetime(1953, 1, 1)
end_date = dt.datetime(1953, 4, 1)
measurements_yearly = ddlpy.measurements(location, start_date=start_date, end_date=end_date, freq=None)
assert measurements.shape == measurements_yearly.shape


def test_measurements_available(location):
start_date = dt.datetime(1953, 1, 1)
end_date = dt.datetime(1953, 4, 1)
Expand Down
Loading