Optimize BmiAutoGenerationAdder IO performance.

Modifying add_initial_data to write all BMI configs at once to a temporary directory so that files can be added to the dataset all at once, allowing any optimizations available to the manager implementation to be used.
NOAA-OWP · Jul 2, 2024 · 158808f · 158808f
1 parent 6fcb923
commit 158808f
Showing 1 changed file with 39 additions and 6 deletions.
diff --git a/python/services/dataservice/dmod/dataservice/initial_data_adder_impl.py b/python/services/dataservice/dmod/dataservice/initial_data_adder_impl.py
@@ -1,7 +1,10 @@
+from __future__ import annotations
+
 import json
 import pandas as pd
 import geopandas as gpd
 import io
+import tempfile
 from dmod.communication import AbstractNgenRequest, NgenCalibrationRequest
 from dmod.communication.maas_request.ngen.partial_realization_config import PartialRealizationConfig
 from dmod.core.dataset import Dataset, DatasetManager, InitialDataAdder
@@ -503,6 +506,28 @@ def __init__(self,
                  realization_cfg_file_name: str,
                  noah_owp_params_dir: Optional[Union[str, Path]] = None,
                  catchment_subset: Optional[Set[str]] = None):
+        """
+        Initialize an instance using datasets (and certain specific metadata) for hydrofabric and realization config.
+
+        Parameters
+        ----------
+        hydrofabric_dataset
+            The dataset for the hydrofabric used for generation.
+        hydrofabric_geopackage_file_name
+            The name of the primary hydrofabric geopackage file within the hydrofabric dataset.
+        hydrofabric_model_attributes_file_name
+            The name of the primary hydrofabric model attributes file within the hydrofabric dataset.
+        realization_config_dataset
+            The dataset for the ngen realization config use for generation.
+        realization_cfg_file_name
+            The name of the main realization config file within its dataset.
+        noah_owp_params_dir
+            An optional path to a directory containing Noah-OWP-modular parameter data, if Noah-OWP-modular configs are
+            to be generated.
+        catchment_subset
+            An optional subset of catchments for which to generate BMI init configs, instead of all those having
+            formulations implicitly (i.e., via the global formulation) or explicitly within the realization config.
+        """
         def load_from_dataset(ds: Dataset, item_name: str) -> bytes:
             return ds.manager.get_data(dataset_name=ds.name, item_name=item_name)
 
@@ -556,9 +581,17 @@ def add_initial_data(self):
         DmodRuntimeError
             Raised when initial data could not be assembled and/or added successfully to the dataset.
         """
-        for cat_id, config in self._bmi_generator.generate_configs():
-            item = f"{config.__class__.__name__}_{cat_id}.{_get_file_extension(config)}"
-            data = self._serialize(config)
-            # TODO: (later) see if we can take advantage of threading somehow here for the IO
-            if not self._dataset_manager.add_data(dataset_name=self._dataset_name, dest=item, data=data):
-                raise DmodRuntimeError(f"{self.__class__.__name__} failed to add generated BMI init config {item}")
+        dataset: Dataset = self._dataset_manager.datasets[self._dataset_name]
+
+        # We should be able to get away with just maintaining the same domain at this stage for this type of data
+        # Supplying this to add_data() eventually gets to the merge stage, where equal domains just do nothing
+        initial_domain = dataset.data_domain
+
+        # Write all files, then send all at once to add_data to potentially take advantage of optimizations
+        # Get temp directory to write to
+        with tempfile.TemporaryDirectory() as temp_dir_name:
+            self._bmi_generator.write_configs(output_dir=temp_dir_name)
+            # Pass everything to add_data() at once and hope for implementation-specific optimizations
+            if not self._dataset_manager.add_data(dataset_name=self._dataset_name, dest='', source=temp_dir_name,
+                                                  domain=initial_domain):
+                raise DmodRuntimeError(f"{self.__class__.__name__} failed to add generated BMI init configs")