[1pt] Update CatFIM metadata API pulls and add duplicate site filtering, add CatFIM vis files #1379

Open · wants to merge 14 commits into base: dev
18 changes: 18 additions & 0 deletions docs/CHANGELOG.md
@@ -1,6 +1,24 @@
All notable changes to this project will be documented in this file.
We follow the [Semantic Versioning 2.0.0](http://semver.org/) format.

## v4.5.___ - 2025-___ - [PR#1379](https://github.com/NOAA-OWP/inundation-mapping/pull/1379)

Many sites in non-CONUS regions (AK, PR, HI) where we would like to run CatFIM are currently excluded because they are not NWM forecast points. This update reinstates the dual API pull and adds code to filter out duplicate (and NULL) lids from the metadata lists.
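
A minimal sketch of the duplicate/NULL filtering, assuming (as in the diff below) that each WRDS metadata record is a dict exposing `['identifiers']['nws_lid']`; the actual implementation in `generate_categorical_fim_flows.py` also tracks and logs the duplicates it drops:

```python
def filter_duplicate_lids(unfiltered_meta_list):
    """Keep the first record per nws_lid; drop records with no LID."""
    seen_lids = set()
    output_meta_list = []
    for site in unfiltered_meta_list:
        nws_lid = site['identifiers']['nws_lid']
        if nws_lid is None or nws_lid in seen_lids:
            continue  # skip NULL and duplicate LIDs
        seen_lids.add(nws_lid)
        output_meta_list.append(site)
    return output_meta_list
```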

### Additions
- `inundation-mapping/tools/catfim/vis_categorical_fim.py`: Functions for reading in, processing, and visualizing CatFIM results.
- `inundation-mapping/tools/catfim/notebooks/vis_catfim_cross_section.ipynb`: A new Jupyter notebook for viewing and analyzing CatFIM results.
- `inundation-mapping/tools/catfim/notebooks/eval_catfim_metadata.ipynb`: A new Jupyter notebook for evaluating metadata and results from CatFIM runs.
- `inundation-mapping/tools/catfim/notebooks/.ipynb_checkpoints/vis_catfim_cross_section-checkpoint.ipynb`: Checkpoint file.
- `inundation-mapping/tools/catfim/notebooks/.ipynb_checkpoints/eval_catfim_metadata-checkpoint.ipynb`: Checkpoint file.

### Changes

- `inundation-mapping/tools/catfim/generate_categorical_fim_flows.py`: Re-implements the dual API call and filters out duplicate sites.
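
For reference, the dual pull looks roughly like the sketch below, reusing the `get_metadata` arguments shown in the diff (`select_by`, `selector`, `must_include`, and the upstream/downstream trace distances); it assumes `get_metadata` is imported from the repo's shared tools module:

```python
# Pull 1: all NWM forecast points nationwide
forecast_point_meta_list, _ = get_metadata(
    metadata_url,
    select_by='nws_lid',
    selector=['all'],
    must_include='nws_data.rfc_forecast_point',
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)

# Pull 2: every site (forecast and non-forecast) in the OCONUS regions
oconus_meta_list, _ = get_metadata(
    metadata_url,
    select_by='state',
    selector=['HI', 'PR', 'AK'],
    must_include=None,
    upstream_trace_distance=nwm_us_search,
    downstream_trace_distance=nwm_ds_search,
)

# Combine, then remove NULL/duplicate LIDs as sketched above
unfiltered_meta_list = forecast_point_meta_list + oconus_meta_list
```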


<br/><br/>

## v4.5.13.1 - 2024-12-13 - [PR#1361](https://github.com/NOAA-OWP/inundation-mapping/pull/1361)

This PR was triggered by two Dependabot PRs: one for Tornado, one for aiohttp. Upon further research, these two exist only as dependencies of Jupyter and JupyterLab, which were very out of date. Upgrading Jupyter/JupyterLab took care of both.
95 changes: 49 additions & 46 deletions tools/catfim/generate_categorical_fim_flows.py
@@ -646,7 +646,7 @@ def __load_nwm_metadata(

FLOG.trace(metadata_url)

all_meta_lists = []
output_meta_list = []
# Check to see if meta file already exists
# This feature means we can copy the pickle file to another enviro (AWS?) as it won't need to call
# WRDS unless we need a smaller or modified version. This one likely has all nws_lid data.
@@ -655,40 +655,31 @@
FLOG.lprint(f"Meta file already downloaded and exists at {nwm_metafile}")

with open(nwm_metafile, "rb") as p_handle:
all_meta_lists = pickle.load(p_handle)
output_meta_list = pickle.load(p_handle)

else:
meta_file = os.path.join(output_catfim_dir, "nwm_metafile.pkl")

FLOG.lprint(f"Meta file will be downloaded and saved at {meta_file}")

if lid_to_run != "all":
# single lid for now

# must_include_value variable not yet tested
# must_include_value = 'nws_data.rfc_forecast_point' if lid_to_run not in ['HI', 'PR', 'AK'] else None
all_meta_lists, ___ = get_metadata(
if lid_to_run != "all": # TODO: Deprecate LID options (in favor of HUC list functionlity)
# Single lid for now (deprecated)
output_meta_list, ___ = get_metadata(
metadata_url,
select_by='nws_lid',
selector=[lid_to_run],
must_include='nws_data.rfc_forecast_point',
upstream_trace_distance=nwm_us_search,
downstream_trace_distance=nwm_ds_search,
)
else:
# This gets all records including AK, HI and PR, but only if they have forecast points

# Note: Nov 2024: AK has 152 sites with forecast points, but after research,
# none of the AK sites where nws_data.rfc_forecast_point = False are needed, so we can
# exclude them.
# Previously we allowed HI and PR sites to come in and most were failing.
# So, we will include HI and PR as well here

# We cannot just filter based on duplicate lids because, depending on which
# metadata load they are on, duplicate lid records will have different data
else:
# Dec 2024: Running two API calls: one to get all forecast points, and another
# to get all points (non-forecast and forecast) for the OCONUS regions. Then,
# duplicate LIDs are removed.

# orig_meta_lists, ___ = get_metadata(
all_meta_lists, ___ = get_metadata(
# Get all forecast points
forecast_point_meta_list, ___ = get_metadata(
metadata_url,
select_by='nws_lid',
selector=['all'],
@@ -697,41 +688,53 @@ def __load_nwm_metadata(
downstream_trace_distance=nwm_ds_search,
)

# If we decided to put HI and PR back in, we could do two loads: one
# with the flag and one without. Then iterate through the meta_lists
# results and filter based on the full state value. Then
# call WRDS again with those specific states and simply concat them.
# Get all points for OCONUS regions (HI, PR, and AK)
oconus_meta_list, ___ = get_metadata(
metadata_url,
select_by='state',
selector=['HI', 'PR', 'AK'],
must_include=None,
upstream_trace_distance=nwm_us_search,
downstream_trace_distance=nwm_ds_search,
)

# Append the lists
unfiltered_meta_list = forecast_point_meta_list + oconus_meta_list

# filtered_meta_data = []
# for metadata in orig_meta_lists:
# df = pd.json_normalize(metadata)
# state = df['nws_data.state'].item()
# lid = df['identifiers.nws_lid'].item()
# if state.lower() not in ["alaska", "hawaii", "puerto rico"]:
# filtered_meta_data.append(metadata)
# print(f"len(all_meta_lists) is {len(all_meta_lists)}")

# must_include='nws_data.rfc_forecast_point',
# Filter the metadata list
output_meta_list = []
unique_lids, duplicate_lids = [], [] # TODO: remove?
duplicate_meta_list = []
nonelid_metadata_list = [] # TODO: remove

# Nov 2024: We used to call them site specific and may add them back in but ok to leave them
for i, site in enumerate(unfiltered_meta_list):
nws_lid = site['identifiers']['nws_lid']

# islands_list, ___ = get_metadata(
# metadata_url,
# select_by='state',
# selector=['HI', 'PR'],
# must_include=None,
# upstream_trace_distance=nwm_us_search,
# downstream_trace_distance=nwm_ds_search,
# )
if nws_lid is None:
# No LID available
nonelid_metadata_list.append(site) # TODO: replace with Continue

# Append the lists
# all_meta_lists = filtered_all_meta_list + islands_list
elif nws_lid in unique_lids:
# Duplicate LID
duplicate_lids.append(nws_lid)
duplicate_meta_list.append(site) # TODO: remove extra lists

# print(f"len(all_meta_lists) is {len(all_meta_lists)}")
else:
# Unique/unseen LID that's not None
unique_lids.append(nws_lid)
output_meta_list.append(site)

FLOG.lprint(f'{len(duplicate_lids)} duplicate points removed.')
FLOG.lprint(f'Filtered metadata downloaded for {len(output_meta_list)} points.')

# ----------

with open(meta_file, "wb") as p_handle:
pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(output_meta_list, p_handle, protocol=pickle.HIGHEST_PROTOCOL)

return all_meta_lists
return output_meta_list


if __name__ == '__main__':