From 054581cc41c80b047b488ba86eb88eae5fef5e51 Mon Sep 17 00:00:00 2001
From: Robert Bartel
Date: Tue, 2 Jul 2024 11:34:30 -0400
Subject: [PATCH] Optimize object store manager using archiving.

Take advantage of the minio archiving feature when adding data to an
empty, read-only dataset (i.e., when adding initial data that will not
change), since storing many individual files in a minio bucket carries
significant overhead.
---
 .../modeldata/data/object_store_manager.py | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py
index 8b3097e23..f48673b71 100644
--- a/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py
+++ b/python/lib/modeldata/dmod/modeldata/data/object_store_manager.py
@@ -1,11 +1,12 @@
 import io
 import json
 import logging
+import tempfile
 
 import minio.retention
 
 from dmod.core.meta_data import DataCategory, DataDomain
-from dmod.core.dataset import Dataset, DatasetManager, DatasetType, InitialDataAdder
+from dmod.core.dataset import DataArchiving, Dataset, DatasetManager, DatasetType, InitialDataAdder
 from dmod.core.common.reader import Reader
 from datetime import datetime, timedelta
 from minio import Minio
@@ -14,6 +15,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Set, Tuple, Union
 from uuid import UUID
+from zipfile import ZipFile
 
 
 class ObjectStoreDatasetManager(DatasetManager):
@@ -245,6 +247,14 @@ def add_data(self, dataset_name: str, dest: str, domain: DataDomain, data: Optio
         ::method:`_push_file`
         ::method:`_push_files`
         """
+        # Prevent adding to read-only dataset except when first setting it up
+        if self.datasets[dataset_name].is_read_only:
+            ds_files = [f for f in self.list_files(dataset_name) if f != self.get_serial_dataset_filename(dataset_name)]
+            if len(ds_files) != 0:
+                logging.error(f"{self.__class__.__name__} can't add data to read-only dataset except when it is empty "
+                              f"and initializing")
+                return False
+
         if dataset_name not in self.datasets:
             return False
 
@@ -292,8 +302,22 @@ def add_data(self, dataset_name: str, dest: str, domain: DataDomain, data: Optio
             msg = "{}.{} source path '{}' does not exist."
             raise ValueError(msg.format(self.__class__.__name__, _getframe(0).f_code.co_name, source))
         elif src_path.is_dir():
-            bucket_root = kwargs.get('bucket_root', src_path)
-            self._push_files(bucket_name=dataset_name, dir_path=src_path, bucket_root=bucket_root)
+            # Recognized scenario when we may want to take advantage of small file archives
+            # (see https://blog.min.io/small-file-archives/)
+            # Also, we already know from above that, if read-only, dataset must also be empty
+            # TODO: (later) consider whether there is some minimum file count (perhaps configurable) to also consider
+            if self.datasets[dataset_name].is_read_only:
+                self.datasets[dataset_name].data_archiving = DataArchiving.ZIP_0
+                # Combine all the files in that directory into an uncompressed zip archive
+                with tempfile.TemporaryDirectory() as zip_dest_dir_name:
+                    archive_path = Path(f"{zip_dest_dir_name}/{self.datasets[dataset_name].archive_name}")
+                    with ZipFile(archive_path, "w") as archive:
+                        for f in src_path.glob("*"):
+                            archive.write(filename=str(f), arcname=f.name)
+                    self._push_file(bucket_name=dataset_name, file=archive_path)
+            else:
+                self._push_files(bucket_name=dataset_name, dir_path=src_path,
+                                 bucket_root=kwargs.get('bucket_root', src_path))
         self.datasets[dataset_name].data_domain = updated_domain
         self.persist_serialized(dataset_name)
         return True
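
Note: the change above covers only the write path, bundling the source
directory into a single uncompressed zip object. On the read side, MinIO
can serve individual members out of such an archive via the zip
extraction feature described in the blog post linked in the diff, so
consumers need not download the whole object. Below is a minimal sketch
of that read path, assuming the feature is enabled on the deployment;
the endpoint, credentials, bucket, archive, and member names are
hypothetical placeholders, not part of this patch:

    from minio import Minio

    # Hypothetical endpoint and credentials for illustration only.
    client = Minio("minio.example.com:9000", access_key="KEY", secret_key="SECRET")

    # Address "<archive>/<member>" and send the x-minio-extract header so
    # the server unpacks just that member from the zip object on the fly.
    response = client.get_object(
        bucket_name="example-dataset",
        object_name="dataset_archive.zip/catchment_data.csv",
        request_headers={"x-minio-extract": "true"},
    )
    try:
        member_bytes = response.read()
    finally:
        response.close()
        response.release_conn()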