Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ECCC functionality and create Preprocess module #165

Draft
wants to merge 46 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
7a9d15c
WIP - major refactoring of ECCC
Zeitsperre Jun 19, 2023
d1a8c67
WIP - more units handling
Zeitsperre Jun 20, 2023
94fe866
WIP - more refactoring - AHCCD incomplete
Zeitsperre Jun 20, 2023
d63523b
broken - refactoring of station writer
Zeitsperre Jun 21, 2023
500ae2a
broken - more refactoring
Zeitsperre Jul 6, 2023
c781287
broken - more refactoring
Zeitsperre Jul 6, 2023
8cf4368
Merge branch 'main' into fix_eccc
Zeitsperre Jul 31, 2023
ce2f6d9
significant refactoring - WIP
Zeitsperre Aug 2, 2023
9baaad5
working version of ahccd conversion
Zeitsperre Aug 3, 2023
53fc8f0
working version of ahccd conversion
Zeitsperre Aug 7, 2023
f454cb6
naming and more dynamic handling of variables
Zeitsperre Aug 7, 2023
9339f30
working version
Zeitsperre Aug 7, 2023
e096824
begin work on obs-summaries
Zeitsperre Aug 7, 2023
268cca1
finishing touches on ahccd
Zeitsperre Aug 9, 2023
8728c66
significant refactoring
Zeitsperre Aug 9, 2023
010d3ea
reduce amount of unit conversions
Zeitsperre Aug 10, 2023
7d8fdf9
refactoring - move treatments to new module, load_json_data_mappings …
Zeitsperre Aug 14, 2023
318957e
more refactoring
Zeitsperre Aug 14, 2023
42b8056
merge main into fix-eccc
Zeitsperre Feb 29, 2024
f5ca682
fix metadata, adjust tests
Zeitsperre Feb 29, 2024
412681c
Merge branch 'main' into fix_eccc
Zeitsperre Mar 4, 2024
c2e442a
error handling
Zeitsperre Mar 6, 2024
f5fce92
better error handling
Zeitsperre Mar 6, 2024
75e8015
import fixes
Zeitsperre Mar 6, 2024
c41e687
Merge branch 'main' into fix_eccc
Zeitsperre Mar 18, 2024
8c4ffcc
fix logic
Zeitsperre Mar 18, 2024
448ba07
fix logic
Zeitsperre Mar 18, 2024
53b9c35
fix logic
Zeitsperre Mar 18, 2024
159fc94
add files to sdist
Zeitsperre Mar 18, 2024
586f4e6
synchronize dependencies
Zeitsperre Mar 18, 2024
6c5586b
debugging
Zeitsperre Mar 18, 2024
e2762d3
chase down bug
Zeitsperre Mar 18, 2024
d20be40
update attr treatments
Zeitsperre Mar 19, 2024
74e7949
fixes
Zeitsperre Mar 19, 2024
42b957d
fix folder name
Zeitsperre Mar 19, 2024
288582f
add support for new `h` freq
Zeitsperre Mar 27, 2024
55ee882
dependencies
Zeitsperre Mar 27, 2024
66a2006
restructure
Zeitsperre Jan 6, 2025
13875cd
update deps
Zeitsperre Jan 6, 2025
6d35a9a
try to negotiate merge
Zeitsperre Jan 6, 2025
fa7be57
remove artifacts
Zeitsperre Jan 6, 2025
8cf5c33
fix style violations
Zeitsperre Jan 6, 2025
e763227
fix bad imports
Zeitsperre Jan 6, 2025
b8756b2
update cookiecutter template
Zeitsperre Jan 6, 2025
0e11f16
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 6, 2025
6e8d062
update conventions
Zeitsperre Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
broken - more refactoring
Zeitsperre committed Jul 6, 2023
commit c781287840afd3d8cc6d055fc1af95fec93fd751
41 changes: 22 additions & 19 deletions miranda/convert/eccc_obs.py
Original file line number Diff line number Diff line change
@@ -27,7 +27,7 @@
from datetime import datetime as dt
from logging import config
from pathlib import Path
from typing import List, Tuple, Union, Type, Any
from typing import Any, List

import dask.dataframe as dd
import numpy as np
@@ -56,22 +56,33 @@
# Date stamp formatted as "<day> <full month name> <year>", evaluated once when the module is imported.
TABLE_DATE = dt.now().strftime("%d %B %Y")


def _fwf_column_definitions(time_frequency: str) -> Tuple[List[str], List[int], List[Type[Union[str, int]]]]:
def _fwf_column_definitions(
time_frequency: str,
) -> tuple[list[str], list[int], list[type[str | int]]]:
"""Return the column names, widths, and data types for the fixed-width format."""

# Preparing the column headers
if time_frequency.lower() in ["h", "hour", "hourly"]:
num_observations = 24
column_names = ["code", "year", "month", "day", "code_var"]
column_widths = [7, 4, 2, 2, 3] + [6, 1] * num_observations
column_widths = [7, 4, 2, 2, 3]
column_dtypes = [str, int, int, int, str]
elif time_frequency.lower() in ["d", "day", "daily"]:
num_observations = 31
column_names = ["code", "year", "month", "code_var"]
column_widths = [7, 4, 2, 3] + [6, 1] * num_observations
column_widths = [7, 4, 2, 3]
column_dtypes = [str, int, int, str]
else:
raise NotImplementedError("`mode` must be 'h'/'hourly or 'd'/'daily'.")

# Add the data columns
for i in range(1, num_observations + 1):
data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}"
column_names.append(data_entry)
column_names.append(flag_entry)
column_widths.extend([6, 1] * num_observations)
column_dtypes.extend([str, str])

return column_names, column_widths, column_dtypes


@@ -85,7 +96,12 @@ def _remove_duplicates(ds):


def convert_station(
data: str | os.PathLike, mode: str, using_dask_array: bool = False, *, client: Any, **kwargs
data: str | os.PathLike,
mode: str,
using_dask_array: bool = False,
*,
client: Any,
**kwargs,
):
data = Path(data)
column_names, column_widths, column_dtypes = _fwf_column_definitions(mode)
@@ -362,7 +378,7 @@ def _convert_station_file(

with client(**dask_kwargs) as c:
try:
convert_station(data, mode, using_dask=using_dask)
convert_station(data, mode, using_dask=using_dask, client=c)
except FileNotFoundError:
errored_files.append(data)

@@ -393,15 +409,6 @@ def convert_flat_files(
-------
None
"""
func_time = time.time()

# Preparing the data column headers
for i in range(1, num_observations + 1):
data_entry, flag_entry = f"D{i:0n}", f"F{i:0n}"
column_names.append(data_entry)
column_names.append(flag_entry)
column_dtypes.extend([str, str])

if isinstance(variables, (str, int)):
variables = [variables]

@@ -435,8 +442,6 @@ def convert_flat_files(
errored_files=errored_files,
mode=mode,
variable_code=variable_code,
column_names=column_names,
column_dtypes=column_dtypes,
**metadata,
)
with mp.Pool(processes=n_workers) as pool:
@@ -449,8 +454,6 @@ def convert_flat_files(
"Some files failed to be properly parsed:\n", ", ".join(errored_files)
)

logging.warning(f"Process completed in {time.time() - func_time:.2f} seconds")


def merge_stations(
source_files: str | os.PathLike | None = None,