Repack Nwb Files #1003

Draft
wants to merge 37 commits into base: main

Commits (showing changes from 12 of 37 commits)
7304229
setup temp conversion script
pauladkisson Aug 12, 2024
4cc2a06
added from_existing_neurodata_object for hdf5
pauladkisson Aug 12, 2024
c33dfbf
added get_existing_dataset_io_configurations
pauladkisson Aug 12, 2024
80c1fba
added support for chunk_shape=None
pauladkisson Aug 13, 2024
7ee6fc6
added from_existing_nwbfile to HDF5BackendConfiguration
pauladkisson Aug 13, 2024
dacdeea
added get_existing_backend_configuration
pauladkisson Aug 13, 2024
dae04bf
added repack_nwbfile
pauladkisson Aug 13, 2024
4ac6e33
fixed bug with export options and hdmf.container.Container.set_data_io
pauladkisson Aug 14, 2024
ce267fb
refactored from_ methods
pauladkisson Aug 14, 2024
49f4262
template and changes optional
pauladkisson Aug 14, 2024
d93a5c5
added image series test
pauladkisson Aug 15, 2024
ab8b22f
Merge branch 'main' into repack
bendichter Aug 15, 2024
934bb3a
Merge branch 'main' into repack
pauladkisson Aug 15, 2024
1ad69ca
added initial test
pauladkisson Aug 15, 2024
04fb89c
updated signature to use file_path
pauladkisson Aug 16, 2024
6dab477
added test for trials table (fails)
pauladkisson Aug 16, 2024
e6d31a6
moved backend_configuration_changes to top of the fn
pauladkisson Aug 16, 2024
7252449
consolidated configure_and_export_nwbfile into configure_and_write_nw…
pauladkisson Aug 16, 2024
2ef5c44
parameterized for use_default_backend_configuration
pauladkisson Aug 16, 2024
80eb598
optional dci
pauladkisson Aug 19, 2024
433f8c9
added test for backend config changes
pauladkisson Aug 19, 2024
dd906ac
updated api to use boolean use_default flag instead of mode=existing
pauladkisson Aug 19, 2024
668cacc
added test for get_existing_backend_configuration
pauladkisson Aug 19, 2024
7796197
removed image_series test
pauladkisson Aug 19, 2024
b8a788c
added compressed trials table column
pauladkisson Aug 19, 2024
f631fb4
added test for get_existing_dataset_io.py
pauladkisson Aug 20, 2024
b089eb3
Merge branch 'main' into repack
pauladkisson Aug 20, 2024
c464764
added docstrings
pauladkisson Aug 20, 2024
1cf3629
used BACKEND_NWB_IO dict
pauladkisson Aug 20, 2024
481529f
added ZarrDatsetIOConfiguration.from_neurodata_object
pauladkisson Aug 20, 2024
1e6b119
Merge branch 'main' into repack
bendichter Aug 20, 2024
9f02b61
removed unnecessary indent
pauladkisson Aug 21, 2024
9ee146f
estimate buffer shape
pauladkisson Aug 21, 2024
ee7ec52
updated temp_test
pauladkisson Aug 21, 2024
a2145a1
added zarr to dataset_io tests
pauladkisson Aug 22, 2024
5785af0
added zarr to backend_configuration tests
pauladkisson Aug 22, 2024
b07c002
added zarr to repack_nwbfile tests
pauladkisson Aug 22, 2024
4 changes: 4 additions & 0 deletions src/neuroconv/tools/nwb_helpers/__init__.py
@@ -6,6 +6,7 @@
from ._backend_configuration import (
BACKEND_CONFIGURATIONS,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ._configuration_models import DATASET_IO_CONFIGURATIONS
from ._configuration_models._base_backend import BackendConfiguration
@@ -30,6 +31,7 @@
get_module,
make_nwbfile_from_metadata,
make_or_load_nwbfile,
repack_nwbfile,
)

__all__ = [
@@ -46,6 +48,7 @@
"ZarrDatasetIOConfiguration",
"get_default_backend_configuration",
"get_default_dataset_io_configurations",
"get_existing_backend_configuration",
"configure_backend",
"get_default_dataset_io_configurations",
"get_default_backend_configuration",
@@ -55,4 +58,5 @@
"get_module",
"make_nwbfile_from_metadata",
"make_or_load_nwbfile",
"repack_nwbfile",
]
17 changes: 16 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_backend_configuration.py
@@ -2,7 +2,8 @@

from typing import Literal, Union

from pynwb import NWBFile
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
from ._configuration_models._zarr_backend import ZarrBackendConfiguration
@@ -17,3 +18,17 @@ def get_default_backend_configuration(

BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile)


def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
"""Fill an existing backend configuration to serve as a starting point for further customization."""

read_io = nwbfile.read_io
if isinstance(read_io, NWBHDF5IO):
backend = "hdf5"
elif isinstance(read_io, NWBZarrIO):
backend = "zarr"
else:
raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.")
BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, mode="existing")
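A minimal usage sketch (not part of the diff; the file path is hypothetical) of how this helper appears intended to be called. The file must be read back from disk first so that nwbfile.read_io is populated:

    from pynwb import NWBHDF5IO

    from neuroconv.tools.nwb_helpers import get_existing_backend_configuration

    with NWBHDF5IO("existing_file.nwb", mode="r") as io:  # hypothetical path
        nwbfile = io.read()
        # Mirrors the chunking/compression already present on disk rather than NeuroConv defaults
        backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile)
        print(backend_configuration)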
Original file line number Diff line number Diff line change
@@ -9,7 +9,10 @@

from ._base_dataset_io import DatasetIOConfiguration
from ._pydantic_pure_json_schema_generator import PureJSONSchemaGenerator
from .._dataset_configuration import get_default_dataset_io_configurations
from .._dataset_configuration import (
get_default_dataset_io_configurations,
get_existing_dataset_io_configurations,
)


class BackendConfiguration(BaseModel):
@@ -56,11 +59,16 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]:
return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

@classmethod
def from_nwbfile(cls, nwbfile: NWBFile) -> Self:
default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
def from_nwbfile(cls, nwbfile: NWBFile, mode: Literal["default", "existing"] = "default") -> Self:
if mode == "default":
dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
elif mode == "existing":
dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
else:
raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}")
dataset_configurations = {
default_dataset_configuration.location_in_file: default_dataset_configuration
for default_dataset_configuration in default_dataset_configurations
for default_dataset_configuration in dataset_io_configurations
}

return cls(dataset_configurations=dataset_configurations)
Original file line number Diff line number Diff line change
@@ -147,7 +147,6 @@ def __str__(self) -> str:
"""
size_in_bytes = math.prod(self.full_shape) * self.dtype.itemsize
maximum_ram_usage_per_iteration_in_bytes = math.prod(self.buffer_shape) * self.dtype.itemsize
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize

string = (
f"\n{self.location_in_file}"
@@ -159,10 +158,14 @@
f"\n buffer shape : {self.buffer_shape}"
f"\n expected RAM usage : {human_readable_size(maximum_ram_usage_per_iteration_in_bytes)}"
"\n"
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.chunk_shape is not None:
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
string += (
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.compression_method is not None:
string += f"\n compression method : {self.compression_method}"
if self.compression_options is not None:
@@ -182,9 +185,9 @@ def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
dataset_name == location_in_file.split("/")[-1]
), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!"

chunk_shape = values["chunk_shape"]
buffer_shape = values["buffer_shape"]
full_shape = values["full_shape"]
chunk_shape = values["chunk_shape"] if values["chunk_shape"] is not None else full_shape
buffer_shape = values["buffer_shape"] if values["buffer_shape"] is not None else full_shape

if len(chunk_shape) != len(buffer_shape):
raise ValueError(
Original file line number Diff line number Diff line change
@@ -3,9 +3,11 @@
from typing import Any, Dict, Literal, Union

import h5py
from hdmf import Container
from pydantic import Field, InstanceOf
from typing_extensions import Self

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...importing import is_package_installed

_base_hdf5_filters = set(h5py.filters.decode)
@@ -78,3 +80,38 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts)

return dict(chunks=self.chunk_shape, **compression_bundle)

@classmethod
def from_neurodata_object(
cls,
neurodata_object: Container,
dataset_name: Literal["data", "timestamps"],
mode: Literal["default", "existing"] = "default",
) -> Self:
if mode == "default":
return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)
elif mode == "existing":
location_in_file = _find_location_in_memory_nwbfile(
neurodata_object=neurodata_object, field_name=dataset_name
)
full_shape = getattr(neurodata_object, dataset_name).shape
dtype = getattr(neurodata_object, dataset_name).dtype
chunk_shape = getattr(neurodata_object, dataset_name).chunks
buffer_shape = getattr(neurodata_object, dataset_name).maxshape
compression_method = getattr(neurodata_object, dataset_name).compression
compression_opts = getattr(neurodata_object, dataset_name).compression_opts
compression_options = dict(compression_opts=compression_opts)
return cls(
object_id=neurodata_object.object_id,
object_name=neurodata_object.name,
location_in_file=location_in_file,
dataset_name=dataset_name,
full_shape=full_shape,
dtype=dtype,
chunk_shape=chunk_shape,
buffer_shape=buffer_shape,
compression_method=compression_method,
compression_options=compression_options,
)
else:
raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}")
64 changes: 64 additions & 0 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
@@ -172,3 +172,67 @@ def get_default_dataset_io_configurations(
)

yield dataset_io_configuration


def get_existing_dataset_io_configurations(
nwbfile: NWBFile,
backend: Literal["hdf5", "zarr"],
) -> Generator[DatasetIOConfiguration, None, None]:

DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend]

known_dataset_fields = ("data", "timestamps")
for neurodata_object in nwbfile.objects.values():
if isinstance(neurodata_object, DynamicTable):
dynamic_table = neurodata_object # For readability

for column in dynamic_table.columns:
candidate_dataset = column.data # VectorData object

# Skip over columns whose values are links, such as the 'group' of an ElectrodesTable
if any(isinstance(value, Container) for value in candidate_dataset):
continue # Skip

# Skip columns whose values are a reference type
if isinstance(column, TimeSeriesReferenceVectorData):
continue

# Skip datasets with any zero-length axes
dataset_name = "data"
candidate_dataset = getattr(column, dataset_name)
full_shape = get_data_shape(data=candidate_dataset)
if any(axis_length == 0 for axis_length in full_shape):
continue

dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
neurodata_object=column,
dataset_name=dataset_name,
mode="existing",
)

yield dataset_io_configuration
elif isinstance(neurodata_object, NWBContainer):
for known_dataset_field in known_dataset_fields:
# Skip optional fields that aren't present
if known_dataset_field not in neurodata_object.fields:
continue

candidate_dataset = getattr(neurodata_object, known_dataset_field)

# Skip edge case of in-memory ImageSeries with external mode; data is in fields and is empty array
if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0:
continue

# Skip datasets with any zero-length axes
candidate_dataset = getattr(neurodata_object, known_dataset_field)
full_shape = get_data_shape(data=candidate_dataset)
if any(axis_length == 0 for axis_length in full_shape):
continue

dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
neurodata_object=neurodata_object,
dataset_name=known_dataset_field,
mode="existing",
)

yield dataset_io_configuration
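A sketch of direct usage (note this generator is imported from the private _dataset_configuration module; only get_existing_backend_configuration is exported in the __init__ diff above). Each yielded configuration is later keyed by its location_in_file:

    from neuroconv.tools.nwb_helpers._dataset_configuration import (
        get_existing_dataset_io_configurations,
    )

    # `nwbfile` is assumed to have been read from disk, as in the earlier sketches
    for dataset_io_configuration in get_existing_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5"):
        print(dataset_io_configuration.location_in_file, dataset_io_configuration.chunk_shape)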
55 changes: 54 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py
@@ -15,7 +15,12 @@
from pynwb import NWBHDF5IO, NWBFile
from pynwb.file import Subject

from . import BackendConfiguration, configure_backend, get_default_backend_configuration
from . import (
BackendConfiguration,
configure_backend,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ...utils.dict import DeepDict, load_dict_from_file
from ...utils.json_schema import validate_metadata

@@ -370,3 +375,51 @@ def configure_and_write_nwbfile(

with IO(output_filepath, mode="w") as io:
io.write(nwbfile)


def configure_and_export_nwbfile(
nwbfile: NWBFile,
export_nwbfile_path: Path,
backend_configuration: BackendConfiguration,
) -> None:
configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration)

IO = BACKEND_NWB_IO[backend_configuration.backend]
nwbfile.set_modified()
with IO(export_nwbfile_path, mode="w") as io:
io.export(nwbfile=nwbfile, src_io=nwbfile.read_io, write_args=dict(link_data=False))


def repack_nwbfile(
*,
nwbfile: NWBFile,
export_nwbfile_path: Path,
template: Literal["existing", "default"] = "default",
backend_configuration_changes: dict = None,
):
"""Repack the NWBFile with the new backend configuration changes."""

if template == "existing":
backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile)
elif template == "default":
read_io = nwbfile.read_io
if isinstance(read_io, NWBHDF5IO):
backend = "hdf5"
elif isinstance(read_io, NWBZarrIO):
backend = "zarr"
else:
raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.")
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=backend)
else:
raise ValueError(f"template must be either 'default' or 'existing' but got {template}")
dataset_configurations = backend_configuration.dataset_configurations

backend_configuration_changes = backend_configuration_changes or dict()
for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items():
dataset_configuration = dataset_configurations[neurodata_object_location]
for dataset_config_key, dataset_config_value in dataset_config_changes.items():
setattr(dataset_configuration, dataset_config_key, dataset_config_value)

configure_and_export_nwbfile(
nwbfile=nwbfile, backend_configuration=backend_configuration, export_nwbfile_path=export_nwbfile_path
)
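A hedged sketch of the complementary "default" template (temp_test.py below exercises the "existing" template). Keys of backend_configuration_changes are location_in_file strings, and values map configuration field names to overrides; all names and paths here are illustrative:

    from pathlib import Path

    from pynwb import NWBHDF5IO

    from neuroconv.tools.nwb_helpers import repack_nwbfile

    with NWBHDF5IO("existing_file.nwb", mode="r") as io:  # hypothetical path
        nwbfile = io.read()
        repack_nwbfile(
            nwbfile=nwbfile,
            export_nwbfile_path=Path("repacked_default.nwb"),
            template="default",
            backend_configuration_changes={
                "acquisition/test_timeseries/data": dict(compression_method="gzip"),  # assumed location
            },
        )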
54 changes: 54 additions & 0 deletions temp_test.py
@@ -0,0 +1,54 @@
import os
from pathlib import Path

import numpy as np
from pynwb import NWBHDF5IO, H5DataIO, TimeSeries
from pynwb.testing.mock.file import mock_NWBFile

from neuroconv.tools.nwb_helpers import (
repack_nwbfile,
)


def write_nwbfile(nwbfile_path: Path):
if nwbfile_path.exists():
os.remove(nwbfile_path)
nwbfile = mock_NWBFile()
timestamps = np.arange(10.0)
data = np.arange(100, 200, 10)
time_series_with_timestamps = TimeSeries(
name="test_timeseries",
description="an example time series",
data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2),
unit="m",
timestamps=timestamps,
)
nwbfile.add_acquisition(time_series_with_timestamps)
with NWBHDF5IO(nwbfile_path, mode="w") as io:
io.write(nwbfile)


def main():
nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb")
repacked_nwbfile_path = Path("/Volumes/T7/CatalystNeuro/repacked_temp.nwb")
if repacked_nwbfile_path.exists():
os.remove(repacked_nwbfile_path)
if not nwbfile_path.exists():
write_nwbfile(nwbfile_path)
with NWBHDF5IO(nwbfile_path, mode="r") as io:
nwbfile = io.read()
backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))}
repack_nwbfile(
nwbfile=nwbfile,
export_nwbfile_path=repacked_nwbfile_path,
backend_configuration_changes=backend_configuration_changes,
template="existing",
)

with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io:
nwbfile = io.read()
print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }')


if __name__ == "__main__":
main()
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
import pytest
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile
from pynwb.image import ImageSeries
from pynwb.testing.mock.base import mock_TimeSeries
from pynwb.testing.mock.file import mock_NWBFile

@@ -262,3 +263,22 @@ def test_complex_zarr(zarr_nwbfile_path):

"""
assert stdout.getvalue() == expected_print


def test_000_ImageSeries():
Review comment (Member):
I would like to see many more tests here; starting with more basic types like TimeSeries, DynamicTable, etc. before working our way up to the edge case that is an ImageSeries (external vs. internal mode too on that)

Review comment (Member):
Keep in mind the intended point of this function; to start with objects that are either uncompressed, unchunked, or badly chunked; and then we create a new copy of the file that has better compression and chunking on all applicable datasets

nwbfile = mock_NWBFile()

im_series = ImageSeries(
name="my_video", external_file=["my_video.mp4"], starting_frame=[0], format="external", rate=30.0
)
nwbfile.add_acquisition(im_series)

with NWBHDF5IO("test.nwb", "w") as io:
io.write(nwbfile)

io = NWBHDF5IO("test.nwb", "r")
nwbfile = io.read()
print(nwbfile.acquisition["my_video"])

backend_config = get_default_backend_configuration(nwbfile, "hdf5")
print(backend_config) # TODO: Figure out why this doesn't throw an error like Ben said it did
Review comment (Contributor):
Can we add an integration test where we write a new NWB file with a different backend configuration?

Review comment (CodyCBakerPhD, Member, Aug 15, 2024):
Hmm... You mean something like 'make a Zarr-backend copy of this existing HDF5-backend NWB file'? Or same type of backend (such as both HDF5) but different configuration?

Review comment (Contributor):
good question. Both, I guess. I just want to make sure this works end-to-end and does not require us to do anything funky in practice
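One possible shape for such an end-to-end test, restricted to a single backend because repack_nwbfile as written infers the backend from nwbfile.read_io rather than accepting a target backend (names and the chunk override are illustrative):

    import numpy as np
    from pynwb import NWBHDF5IO, TimeSeries
    from pynwb.testing.mock.file import mock_NWBFile

    from neuroconv.tools.nwb_helpers import repack_nwbfile


    def test_repack_hdf5_roundtrip(tmp_path):
        nwbfile_path = tmp_path / "original.nwb"
        repacked_path = tmp_path / "repacked.nwb"

        # Write an uncompressed, default-chunked file
        nwbfile = mock_NWBFile()
        nwbfile.add_acquisition(TimeSeries(name="test_timeseries", data=np.arange(100.0), unit="m", rate=1.0))
        with NWBHDF5IO(nwbfile_path, mode="w") as io:
            io.write(nwbfile)

        # Repack it with the default backend configuration plus one per-dataset override
        with NWBHDF5IO(nwbfile_path, mode="r") as io:
            nwbfile = io.read()
            repack_nwbfile(
                nwbfile=nwbfile,
                export_nwbfile_path=repacked_path,
                template="default",
                backend_configuration_changes={"acquisition/test_timeseries/data": dict(chunk_shape=(10,))},
            )

        # Confirm the override made it to disk
        with NWBHDF5IO(repacked_path, mode="r") as io:
            repacked_nwbfile = io.read()
            assert repacked_nwbfile.acquisition["test_timeseries"].data.chunks == (10,)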

Loading