22
22
import shutil
23
23
24
24
import pandas as pd
25
+ import xarray as xr
25
26
26
27
from disdrodb import __root_path__
27
- from disdrodb .api .path import define_metadata_dir , get_disdrodb_path
28
- from disdrodb .l0 .l0_reader import get_station_reader_function
28
+ from disdrodb .api .io import available_stations
29
+ from disdrodb .api .path import define_campaign_dir , define_station_dir
30
+ from disdrodb .l0 .l0_processing import run_l0a_station
31
+ from disdrodb .metadata import read_station_metadata
29
32
from disdrodb .utils .directories import list_files
30
33
31
34
# Root of the minimal DISDRODB test archive used to validate the readers.
TEST_BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB")
32
35
33
36
34
- def _get_list_test_data_sources () -> list :
35
- """Get list of test data sources.
36
-
37
- Returns
38
- -------
39
- list
40
- List of test data sources.
41
- """
42
-
43
- data_sources = os .listdir (os .path .join (TEST_BASE_DIR , "Raw" ))
44
- return data_sources
45
-
46
-
47
- def _get_list_test_campaigns (data_source : str ) -> list :
48
- """Get list of test campaigns for a given data source.
49
-
50
- Parameters
51
- ----------
52
- data_source : str
53
- Data source.
54
-
55
- Returns
56
- -------
57
- list
58
- List of test campaigns.
59
-
60
- """
61
- campaign_names = os .listdir (os .path .join (TEST_BASE_DIR , "Raw" , data_source ))
62
- return campaign_names
63
-
64
-
65
- def _get_list_test_stations (data_source : str , campaign_name : str ) -> list :
66
- """Get list of test stations for a given data source and campaign.
37
def _check_identical_netcdf_files(file1: str, file2: str) -> None:
    """Check that two L0B netCDF files are identical.

    Parameters
    ----------
    file1 : str
        Path to the first file.
    file2 : str
        Path to the second file.

    Raises
    ------
    AssertionError
        If the two datasets are not identical.
    """
    # Context managers guarantee the underlying file handles are closed
    # even when the comparison below raises.
    with xr.open_dataset(file1) as ds1, xr.open_dataset(file2) as ds2:
        # Remove attributes that depend on processing time
        ds1.attrs.pop("disdrodb_processing_date", None)
        ds2.attrs.pop("disdrodb_processing_date", None)
        # Assert equality
        xr.testing.assert_identical(ds1, ds2)
94
-
95
- def _is_parquet_files_identical (file1 : str , file2 : str ) -> bool :
58
def _check_identical_parquet_files(file1: str, file2: str) -> None:
    """Check that two L0A Parquet files are identical.

    Parameters
    ----------
    file1 : str
        Path to the first file.
    file2 : str
        Path to the second file.

    Raises
    ------
    ValueError
        If the contents of the two files differ.
    """
    df1 = pd.read_parquet(file1)
    df2 = pd.read_parquet(file2)
    if not df1.equals(df2):
        raise ValueError("The two Parquet files differ.")
115
73
116
74
117
- def _run_reader_on_test_data (data_source : str , campaign_name : str ) -> None :
118
- """Run reader over the test data sample.
75
def _check_station_reader_results(
    base_dir,
    data_source,
    campaign_name,
    station_name,
):
    """Run the station reader and compare its products against the ground truth.

    Parameters
    ----------
    base_dir : str
        Base directory of the test DISDRODB archive.
    data_source : str
        Data source name.
    campaign_name : str
        Campaign name.
    station_name : str
        Station name.

    Raises
    ------
    ValueError
        If the number of produced files differs from the ground truth,
        or if any produced file differs from its ground truth counterpart.
    """
    # Honor the base_dir argument (it was previously ignored in favor of
    # the hard-coded TEST_BASE_DIR; callers pass TEST_BASE_DIR anyway).
    raw_dir = define_campaign_dir(
        base_dir=base_dir,
        product="RAW",
        data_source=data_source,
        campaign_name=campaign_name,
    )

    # Run the reader (produces L0A, and L0B for netCDF-based readers)
    run_l0a_station(
        base_dir=base_dir,
        data_source=data_source,
        campaign_name=campaign_name,
        station_name=station_name,
        force=True,
        verbose=False,
        debugging_mode=False,
        parallel=False,
    )

    metadata = read_station_metadata(
        base_dir=base_dir,
        product="L0A",
        data_source=data_source,
        campaign_name=campaign_name,
        station_name=station_name,
    )
    raw_data_format = metadata["raw_data_format"]
    # netCDF raw data map to L0B netCDF products; text data to L0A Parquet.
    if raw_data_format == "netcdf":
        glob_pattern = "*.nc"
        check_identical_files = _check_identical_netcdf_files
        product = "L0B"
    else:  # raw_data_format == "txt"
        glob_pattern = "*.parquet"
        check_identical_files = _check_identical_parquet_files
        product = "L0A"

    ground_truth_station_dir = os.path.join(raw_dir, "ground_truth", station_name)
    processed_station_dir = define_station_dir(
        base_dir=base_dir,
        product=product,
        data_source=data_source,
        campaign_name=campaign_name,
        station_name=station_name,
    )

    # Retrieve files (sorted so ground truth and processed files pair up)
    ground_truth_files = sorted(list_files(ground_truth_station_dir, glob_pattern=glob_pattern, recursive=True))
    processed_files = sorted(list_files(processed_station_dir, glob_pattern=glob_pattern, recursive=True))

    # Check same number of files
    n_ground_truth = len(ground_truth_files)
    n_processed = len(processed_files)
    if n_ground_truth != n_processed:
        raise ValueError(f"{n_ground_truth} ground truth files but only {n_processed} are produced.")

    # Compare equality of files
    for ground_truth_filepath, processed_filepath in zip(ground_truth_files, processed_files):
        try:
            check_identical_files(ground_truth_filepath, processed_filepath)
        except Exception as err:
            # Chain the underlying comparison failure for debuggability.
            raise ValueError(
                f"Reader validation has failed for '{data_source}' '{campaign_name}' '{station_name}'",
            ) from err
163
141
164
142
165
143
def check_all_readers() -> None:
    """Run all readers against the test data samples and validate their outputs.

    Raises
    ------
    ValueError
        If the reader validation has failed.
    """
    list_stations_info = available_stations(
        product="RAW",
        data_sources=None,
        campaign_names=None,
        return_tuple=True,
        base_dir=TEST_BASE_DIR,
    )

    # Stop at the first failing station, but always clean up afterwards.
    # NOTE: the failure is re-raised (previously it was silently swallowed,
    # contradicting the documented behavior).
    try:
        for data_source, campaign_name, station_name in list_stations_info:
            _check_station_reader_results(
                base_dir=TEST_BASE_DIR,
                data_source=data_source,
                campaign_name=campaign_name,
                station_name=station_name,
            )
    finally:
        # Remove Processed directory if exists
        processed_dir = os.path.join(TEST_BASE_DIR, "Processed")
        if os.path.exists(processed_dir):
            shutil.rmtree(processed_dir)
0 commit comments