Skip to content

Commit e2bfdc2

Browse files
committed
Add tests for l0 production from raw netCDFs
1 parent 9a01cd3 commit e2bfdc2

File tree

7 files changed

+411
-346
lines changed

7 files changed

+411
-346
lines changed

Diff for: disdrodb/l0/check_readers.py

+109-131
Original file line numberDiff line numberDiff line change
@@ -22,77 +22,40 @@
2222
import shutil
2323

2424
import pandas as pd
25+
import xarray as xr
2526

2627
from disdrodb import __root_path__
27-
from disdrodb.api.path import define_metadata_dir, get_disdrodb_path
28-
from disdrodb.l0.l0_reader import get_station_reader_function
28+
from disdrodb.api.io import available_stations
29+
from disdrodb.api.path import define_campaign_dir, define_station_dir
30+
from disdrodb.l0.l0_processing import run_l0a_station
31+
from disdrodb.metadata import read_station_metadata
2932
from disdrodb.utils.directories import list_files
3033

3134
TEST_BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB")
3235

3336

34-
def _get_list_test_data_sources() -> list:
35-
"""Get list of test data sources.
36-
37-
Returns
38-
-------
39-
list
40-
List of test data sources.
41-
"""
42-
43-
data_sources = os.listdir(os.path.join(TEST_BASE_DIR, "Raw"))
44-
return data_sources
45-
46-
47-
def _get_list_test_campaigns(data_source: str) -> list:
48-
"""Get list of test campaigns for a given data source.
49-
50-
Parameters
51-
----------
52-
data_source : str
53-
Data source.
54-
55-
Returns
56-
-------
57-
list
58-
List of test campaigns.
59-
60-
"""
61-
campaign_names = os.listdir(os.path.join(TEST_BASE_DIR, "Raw", data_source))
62-
return campaign_names
63-
64-
65-
def _get_list_test_stations(data_source: str, campaign_name: str) -> list:
66-
"""Get list of test stations for a given data source and campaign.
37+
def _check_identical_netcdf_files(file1: str, file2: str) -> bool:
38+
"""Check if two L0B netCDF files are identical.
6739
6840
Parameters
6941
----------
70-
data_source : str
71-
Data source.
72-
73-
campaign_name : str
74-
Name of the campaign.
75-
76-
Returns
77-
-------
78-
list
79-
List of test stations.
42+
file1 : str
43+
Path to the first file.
8044
45+
file2 : str
46+
Path to the second file.
8147
"""
82-
metadata_dir = define_metadata_dir(
83-
product="RAW",
84-
base_dir=TEST_BASE_DIR,
85-
data_source=data_source,
86-
campaign_name=campaign_name,
87-
check_exists=False,
88-
)
89-
filepaths = list_files(metadata_dir, glob_pattern="*.yml", recursive=False)
90-
list_station_names = [os.path.splitext(os.path.basename(i))[0] for i in filepaths]
48+
# Open files
49+
ds1 = xr.open_dataset(file1)
50+
ds2 = xr.open_dataset(file2)
51+
# Remove attributes that depend on processing time
52+
ds1.attrs.pop("disdrodb_processing_date", None)
53+
ds2.attrs.pop("disdrodb_processing_date", None)
54+
# Assert equality
55+
xr.testing.assert_identical(ds1, ds2)
9156

92-
return list_station_names
9357

94-
95-
def _is_parquet_files_identical(file1: str, file2: str) -> bool:
58+
def _check_identical_parquet_files(file1: str, file2: str) -> bool:
9659
"""Check if two parquet files are identical.
9760
9861
Parameters
@@ -102,64 +65,79 @@ def _is_parquet_files_identical(file1: str, file2: str) -> bool:
10265
10366
file2 : str
10467
Path to the second file.
105-
106-
Returns
107-
-------
108-
bool
109-
True if the two files are identical, False otherwise.
110-
11168
"""
11269
df1 = pd.read_parquet(file1)
11370
df2 = pd.read_parquet(file2)
114-
return df1.equals(df2)
71+
if not df1.equals(df2):
72+
raise ValueError("The two Parquet files differ.")
11573

11674

117-
def _run_reader_on_test_data(data_source: str, campaign_name: str) -> None:
118-
"""Run reader over the test data sample.
75+
def _check_station_reader_results(
76+
base_dir,
77+
data_source,
78+
campaign_name,
79+
station_name,
80+
):
81+
raw_dir = define_campaign_dir(
82+
base_dir=TEST_BASE_DIR,
83+
product="RAW",
84+
data_source=data_source,
85+
campaign_name=campaign_name,
86+
)
11987

120-
Parameters
121-
----------
122-
data_source : str
123-
Data source.
124-
campaign_name : str
125-
Campaign name.
126-
"""
127-
station_names = _get_list_test_stations(data_source=data_source, campaign_name=campaign_name)
128-
for station_name in station_names:
129-
reader = get_station_reader_function(
130-
base_dir=TEST_BASE_DIR,
131-
data_source=data_source,
132-
campaign_name=campaign_name,
133-
station_name=station_name,
134-
)
135-
136-
# Define campaign_name raw_dir and process_dir
137-
raw_dir = get_disdrodb_path(
138-
base_dir=TEST_BASE_DIR,
139-
product="RAW",
140-
data_source=data_source,
141-
campaign_name=campaign_name,
142-
)
143-
144-
processed_dir = get_disdrodb_path(
145-
base_dir=TEST_BASE_DIR,
146-
product="L0A",
147-
data_source=data_source,
148-
campaign_name=campaign_name,
149-
check_exists=False,
150-
)
151-
# Call the reader
152-
reader(
153-
raw_dir=raw_dir,
154-
processed_dir=processed_dir,
155-
station_name=station_name,
156-
force=True,
157-
verbose=False,
158-
debugging_mode=False,
159-
parallel=False,
160-
)
161-
162-
return processed_dir
88+
run_l0a_station(
89+
base_dir=TEST_BASE_DIR,
90+
data_source=data_source,
91+
campaign_name=campaign_name,
92+
station_name=station_name,
93+
force=True,
94+
verbose=False,
95+
debugging_mode=False,
96+
parallel=False,
97+
)
98+
99+
metadata = read_station_metadata(
100+
base_dir=TEST_BASE_DIR,
101+
product="L0A",
102+
data_source=data_source,
103+
campaign_name=campaign_name,
104+
station_name=station_name,
105+
)
106+
raw_data_format = metadata["raw_data_format"]
107+
if raw_data_format == "netcdf":
108+
glob_pattern = "*.nc"
109+
check_identical_files = _check_identical_netcdf_files
110+
product = "L0B"
111+
else: # raw_data_format == "txt"
112+
glob_pattern = "*.parquet"
113+
check_identical_files = _check_identical_parquet_files
114+
product = "L0A"
115+
116+
ground_truth_station_dir = os.path.join(raw_dir, "ground_truth", station_name)
117+
processed_station_dir = define_station_dir(
118+
base_dir=TEST_BASE_DIR,
119+
product=product,
120+
data_source=data_source,
121+
campaign_name=campaign_name,
122+
station_name=station_name,
123+
)
124+
125+
# Retrieve files
126+
ground_truth_files = sorted(list_files(ground_truth_station_dir, glob_pattern=glob_pattern, recursive=True))
127+
processed_files = sorted(list_files(processed_station_dir, glob_pattern=glob_pattern, recursive=True))
128+
129+
# Check same number of files
130+
n_ground_truth = len(ground_truth_files)
131+
n_processed = len(processed_files)
132+
if n_ground_truth != n_processed:
133+
raise ValueError(f"{n_ground_truth} ground truth files but only {n_processed} are produced.")
134+
135+
# Compare equality of files
136+
for ground_truth_filepath, processed_filepath in zip(ground_truth_files, processed_files):
137+
try:
138+
check_identical_files(ground_truth_filepath, processed_filepath)
139+
except Exception:
140+
raise ValueError(f"Reader validation has failed for '{data_source}' '{campaign_name}' '{station_name}'")
163141

164142

165143
def check_all_readers() -> None:
@@ -171,28 +149,28 @@ def check_all_readers() -> None:
171149
If the reader validation has failed.
172150
"""
173151

174-
for data_source in _get_list_test_data_sources():
175-
for campaign_name in _get_list_test_campaigns(data_source):
176-
process_dir = _run_reader_on_test_data(data_source, campaign_name)
177-
ground_truth_dir = os.path.join(TEST_BASE_DIR, "Raw", data_source, campaign_name, "ground_truth")
178-
processed_product_dir = os.path.join(process_dir, "L0A")
179-
180-
glob_pattern = os.path.join("*", "*.parquet")
181-
ground_truth_files = list_files(ground_truth_dir, glob_pattern=glob_pattern, recursive=False)
182-
processed_files = list_files(processed_product_dir, glob_pattern=glob_pattern, recursive=False)
183-
184-
for ground_truth_filepath, processed_file_filepath in zip(ground_truth_files, processed_files):
185-
station_name = os.path.basename(os.path.dirname(ground_truth_filepath))
186-
is_correct = _is_parquet_files_identical(ground_truth_filepath, processed_file_filepath)
187-
if not is_correct:
188-
raise Exception(
189-
f"Reader validation has failed for data_source '{data_source}', campaign_name '{campaign_name}'"
190-
f" and station_name '{station_name}'"
191-
)
152+
list_stations_info = available_stations(
153+
product="RAW",
154+
data_sources=None,
155+
campaign_names=None,
156+
return_tuple=True,
157+
base_dir=TEST_BASE_DIR,
158+
)
192159

193-
# Remove Processed directory if exists
194-
if os.path.exists(os.path.join(TEST_BASE_DIR, "Processed")):
160+
check_failed = False
161+
for data_source, campaign_name, station_name in list_stations_info:
195162
try:
196-
shutil.rmtree(os.path.join(TEST_BASE_DIR, "Processed"))
163+
_check_station_reader_results(
164+
base_dir=TEST_BASE_DIR,
165+
data_source=data_source,
166+
campaign_name=campaign_name,
167+
station_name=station_name,
168+
)
197169
except Exception:
198-
pass
170+
check_failed = True
171+
if check_failed:
172+
break
173+
174+
# Remove Processed directory if exists
175+
if os.path.exists(os.path.join(TEST_BASE_DIR, "Processed")):
176+
shutil.rmtree(os.path.join(TEST_BASE_DIR, "Processed"))

Diff for: disdrodb/l0/l0_processing.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
)
3939
from disdrodb.api.info import infer_path_info_dict
4040
from disdrodb.api.path import (
41+
define_campaign_dir,
4142
define_l0a_filepath,
4243
define_l0b_filepath,
4344
define_l0b_station_dir,
@@ -911,20 +912,18 @@ def run_l0a_station(
911912
station_name=station_name,
912913
)
913914
# Define campaign raw_dir and process_dir
914-
raw_dir = get_disdrodb_path(
915+
raw_dir = define_campaign_dir(
915916
base_dir=base_dir,
916917
product="RAW",
917918
data_source=data_source,
918919
campaign_name=campaign_name,
919920
)
920-
processed_dir = get_disdrodb_path(
921+
processed_dir = define_campaign_dir(
921922
base_dir=base_dir,
922-
product="L0A",
923+
product="L0A", # also works for raw netCDFs
923924
data_source=data_source,
924925
campaign_name=campaign_name,
925-
check_exists=False,
926926
)
927-
928927
# Run L0A processing
929928
# --> The reader call the run_l0a within the custom defined reader function
930929
# --> For the special case of raw netCDF data, it calls the run_l0b_from_nc function
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# This file is used to store dates to drop by the reader, the time format used is the isoformat (YYYY-mm-dd HH:MM:SS).
2+
# timestamp: list of timestamps
3+
# time_period: list of list ranges of dates
4+
# Example:
5+
# timestamp: ['2018-12-07 14:15','2018-12-07 14:17','2018-12-07 14:19', '2018-12-07 14:25']
6+
# time_period: [['2018-08-01 12:00:00', '2018-08-01 14:00:00'],
7+
# ['2018-08-01 15:44:30', '2018-08-01 15:59:31'],
8+
# ['2018-08-02 12:44:30', '2018-08-02 12:59:31']]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
data_source: UK
2+
campaign_name: DIVEN
3+
station_name: CAIRNGORM
4+
sensor_name: Thies_LPM
5+
reader: UK/DIVEN
6+
raw_data_format: netcdf
7+
platform_type: fixed
8+
source: NCAS Laser Precipitation Monitor
9+
source_convention: ''
10+
source_processing_date: ''
11+
title: cairngorm
12+
description: 1 minute precipitation characteristics output for a single day from a
13+
DiVeN disdrometer at Cairngorm
14+
project_name: Disdrometer Verification Network (DiVeN)
15+
keywords: ''
16+
summary: ''
17+
history: ''
18+
comment: ''
19+
station_id: '11'
20+
location: Cairngorm
21+
country: United Kingdom
22+
continent: Europe
23+
latitude: 57.0063
24+
longitude: -3.6628
25+
altitude: 781
26+
deployment_status: ''
27+
deployment_mode: ''
28+
platform_protection: ''
29+
platform_orientation: ''
30+
sensor_long_name: Thies_LPM
31+
sensor_manufacturer: ''
32+
sensor_wavelength: 2143 mv mm-2
33+
sensor_serial_number: ''
34+
firmware_iop: ''
35+
firmware_dsp: ''
36+
firmware_version: ''
37+
sensor_beam_length: ''
38+
sensor_beam_width: ''
39+
sensor_nominal_width: ''
40+
measurement_interval: ''
41+
calibration_sensitivity: ''
42+
calibration_certification_date: ''
43+
calibration_certification_url: ''
44+
contributors: Ryan R. Neely III
45+
authors: Ben Pickering
46+
authors_url: ''
47+
contact: Ben Pickering
48+
contact_information: [email protected]
49+
acknowledgement: ''
50+
references: ''
51+
documentation: ''
52+
website: ''
53+
institution: National Centre for Atmospheric Science (NCAS)
54+
source_repository: https://orcid.org/0000-0002-8474-9005
55+
license: ''
56+
doi: ''
57+
disdrodb_data_url: ''

0 commit comments

Comments
 (0)