
Repack NWB Files #1003

Draft
wants to merge 37 commits into main

Changes from 3 commits

Commits (37)
7304229 setup temp conversion script (pauladkisson, Aug 12, 2024)
4cc2a06 added from_existing_neurodata_object for hdf5 (pauladkisson, Aug 12, 2024)
c33dfbf added get_existing_dataset_io_configurations (pauladkisson, Aug 12, 2024)
80c1fba added support for chunk_shape=None (pauladkisson, Aug 13, 2024)
7ee6fc6 added from_existing_nwbfile to HDF5BackendConfiguration (pauladkisson, Aug 13, 2024)
dacdeea added get_existing_backend_configuration (pauladkisson, Aug 13, 2024)
dae04bf added repack_nwbfile (pauladkisson, Aug 13, 2024)
4ac6e33 fixed bug with export options and hdmf.container.Container.set_data_io (pauladkisson, Aug 14, 2024)
ce267fb refactored from_ methods (pauladkisson, Aug 14, 2024)
49f4262 template and changes optional (pauladkisson, Aug 14, 2024)
d93a5c5 added image series test (pauladkisson, Aug 15, 2024)
ab8b22f Merge branch 'main' into repack (bendichter, Aug 15, 2024)
934bb3a Merge branch 'main' into repack (pauladkisson, Aug 15, 2024)
1ad69ca added initial test (pauladkisson, Aug 15, 2024)
04fb89c updated signature to use file_path (pauladkisson, Aug 16, 2024)
6dab477 added test for trials table (fails) (pauladkisson, Aug 16, 2024)
e6d31a6 moved backend_configuration_changes to top of the fn (pauladkisson, Aug 16, 2024)
7252449 consolidated configure_and_export_nwbfile into configure_and_write_nw… (pauladkisson, Aug 16, 2024)
2ef5c44 parameterized for use_default_backend_configuration (pauladkisson, Aug 16, 2024)
80eb598 optional dci (pauladkisson, Aug 19, 2024)
433f8c9 added test for backend config changes (pauladkisson, Aug 19, 2024)
dd906ac updated api to use boolean use_default flag instead of mode=existing (pauladkisson, Aug 19, 2024)
668cacc added test for get_existing_backend_configuration (pauladkisson, Aug 19, 2024)
7796197 removed image_series test (pauladkisson, Aug 19, 2024)
b8a788c added compressed trials table column (pauladkisson, Aug 19, 2024)
f631fb4 added test for get_existing_dataset_io.py (pauladkisson, Aug 20, 2024)
b089eb3 Merge branch 'main' into repack (pauladkisson, Aug 20, 2024)
c464764 added docstrings (pauladkisson, Aug 20, 2024)
1cf3629 used BACKEND_NWB_IO dict (pauladkisson, Aug 20, 2024)
481529f added ZarrDatsetIOConfiguration.from_neurodata_object (pauladkisson, Aug 20, 2024)
1e6b119 Merge branch 'main' into repack (bendichter, Aug 20, 2024)
9f02b61 removed unnecessary indent (pauladkisson, Aug 21, 2024)
9ee146f estimate buffer shape (pauladkisson, Aug 21, 2024)
ee7ec52 updated temp_test (pauladkisson, Aug 21, 2024)
a2145a1 added zarr to dataset_io tests (pauladkisson, Aug 22, 2024)
5785af0 added zarr to backend_configuration tests (pauladkisson, Aug 22, 2024)
b07c002 added zarr to repack_nwbfile tests (pauladkisson, Aug 22, 2024)
@@ -3,9 +3,11 @@
 from typing import Any, Dict, Literal, Union

 import h5py
+from hdmf import Container
 from pydantic import Field, InstanceOf
+from typing_extensions import Self

-from ._base_dataset_io import DatasetIOConfiguration
+from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
 from ...importing import is_package_installed

 _base_hdf5_filters = set(h5py.filters.decode)
@@ -78,3 +80,28 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
         compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts)

         return dict(chunks=self.chunk_shape, **compression_bundle)
+
+    @classmethod
+    def from_existing_neurodata_object(
+        cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]
+    ) -> Self:
+        location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)
+        full_shape = getattr(neurodata_object, dataset_name).shape
+        dtype = getattr(neurodata_object, dataset_name).dtype
+        chunk_shape = getattr(neurodata_object, dataset_name).chunks
+        buffer_shape = getattr(neurodata_object, dataset_name).maxshape
+        compression_method = getattr(neurodata_object, dataset_name).compression
+        compression_opts = getattr(neurodata_object, dataset_name).compression_opts
+        compression_options = dict(compression_opts=compression_opts)
+        return cls(
+            object_id=neurodata_object.object_id,
+            object_name=neurodata_object.name,
+            location_in_file=location_in_file,
+            dataset_name=dataset_name,
+            full_shape=full_shape,
+            dtype=dtype,
+            chunk_shape=chunk_shape,
+            buffer_shape=buffer_shape,
+            compression_method=compression_method,
+            compression_options=compression_options,
+        )
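
For context (not part of the diff): when an NWB file is read from disk with NWBHDF5IO, fields like TimeSeries.data are h5py.Dataset objects, which is what lets from_existing_neurodata_object recover chunking and compression settings straight from the file's metadata. A minimal standalone sketch of the attributes it reads, with a hypothetical file name and dataset path:

import h5py
import numpy as np

# Write a small compressed dataset so there is metadata to probe.
with h5py.File("example.h5", "w") as file:
    file.create_dataset(
        "acquisition/test_timeseries/data",
        data=np.arange(100, 200, 10),
        chunks=(1,),
        compression="gzip",
        compression_opts=2,
    )

# Read it back; these are the attributes the classmethod above pulls out.
with h5py.File("example.h5", "r") as file:
    dataset = file["acquisition/test_timeseries/data"]
    print(dataset.shape)             # full_shape, e.g. (10,)
    print(dataset.dtype)             # dtype, e.g. int64
    print(dataset.chunks)            # chunk_shape, e.g. (1,)
    print(dataset.maxshape)          # used above as buffer_shape
    print(dataset.compression)       # compression_method, e.g. "gzip"
    print(dataset.compression_opts)  # e.g. 2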
src/neuroconv/tools/nwb_helpers/_dataset_configuration.py (60 additions, 0 deletions)
@@ -172,3 +172,63 @@ def get_default_dataset_io_configurations(
                 )

                 yield dataset_io_configuration
+
+
+def get_existing_dataset_io_configurations(
+    nwbfile: NWBFile,
+    backend: Literal["hdf5", "zarr"],
+) -> Generator[DatasetIOConfiguration, None, None]:
+
+    DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend]
+
+    known_dataset_fields = ("data", "timestamps")
+    for neurodata_object in nwbfile.objects.values():
+        if isinstance(neurodata_object, DynamicTable):
+            dynamic_table = neurodata_object  # For readability
+
+            for column in dynamic_table.columns:
+                candidate_dataset = column.data  # VectorData object
+
+                # Skip over columns whose values are links, such as the 'group' of an ElectrodesTable
+                if any(isinstance(value, Container) for value in candidate_dataset):
+                    continue  # Skip
+
+                # Skip columns whose values are a reference type
+                if isinstance(column, TimeSeriesReferenceVectorData):
+                    continue
+
+                # Skip datasets with any zero-length axes
+                dataset_name = "data"
+                candidate_dataset = getattr(column, dataset_name)
+                full_shape = get_data_shape(data=candidate_dataset)
+                if any(axis_length == 0 for axis_length in full_shape):
+                    continue
+
+                dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object(
+                    neurodata_object=column, dataset_name=dataset_name
+                )
+
+                yield dataset_io_configuration
+        elif isinstance(neurodata_object, NWBContainer):
+            for known_dataset_field in known_dataset_fields:
+                # Skip optional fields that aren't present
+                if known_dataset_field not in neurodata_object.fields:
+                    continue
+
+                candidate_dataset = getattr(neurodata_object, known_dataset_field)
+
+                # Skip edge case of in-memory ImageSeries with external mode; data is in fields and is empty array
+                if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0:
+                    continue
+
+                # Skip datasets with any zero-length axes
+                candidate_dataset = getattr(neurodata_object, known_dataset_field)
+                full_shape = get_data_shape(data=candidate_dataset)
+                if any(axis_length == 0 for axis_length in full_shape):
+                    continue
+
+                dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object(
+                    neurodata_object=neurodata_object, dataset_name=known_dataset_field
+                )
+
+                yield dataset_io_configuration
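
A note on the DynamicTable branch above (again, not part of the diff): link-valued columns such as the electrodes table's "group" hold Container objects rather than raw values, which is what the isinstance check detects. A small sketch of that distinction, assuming a recent pynwb where the positional electrode columns (x, y, z, etc.) are optional:

from hdmf import Container
from pynwb.testing.mock.file import mock_NWBFile

nwbfile = mock_NWBFile()
device = nwbfile.create_device(name="probe")
group = nwbfile.create_electrode_group(
    name="shank0", description="example group", location="CA1", device=device
)
nwbfile.add_electrode(location="CA1", group=group, group_name="shank0")

# The "group" column holds ElectrodeGroup containers, so it would be skipped;
# plain-valued columns like "location" would yield a configuration.
for column in nwbfile.electrodes.columns:
    holds_links = any(isinstance(value, Container) for value in column.data)
    print(column.name, "skipped (links)" if holds_links else "configurable")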
temp_test.py (44 additions, 0 deletions)
@@ -0,0 +1,44 @@
+import os
+from pathlib import Path
+
+import numpy as np
+from pynwb import NWBHDF5IO, H5DataIO, TimeSeries
+from pynwb.testing.mock.file import mock_NWBFile
+
+from neuroconv.tools.nwb_helpers._dataset_configuration import (
+    get_existing_dataset_io_configurations,
+)
+
+
+def write_nwbfile(nwbfile_path: Path):
+    if nwbfile_path.exists():
+        os.remove(nwbfile_path)
+    nwbfile = mock_NWBFile()
+    timestamps = np.arange(10.0)
+    data = np.arange(100, 200, 10)
+    time_series_with_timestamps = TimeSeries(
+        name="test_timeseries",
+        description="an example time series",
+        data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2),
+        unit="m",
+        timestamps=H5DataIO(
+            timestamps, compression="gzip", chunks=(1,), compression_opts=2
+        ),  # TODO: add support for uncompressed timestamps
+    )
+    nwbfile.add_acquisition(time_series_with_timestamps)
+    with NWBHDF5IO(nwbfile_path, mode="w") as io:
+        io.write(nwbfile)
+
+
+def main():
+    nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb")
+    write_nwbfile(nwbfile_path)
+    with NWBHDF5IO(nwbfile_path, mode="r") as io:
+        nwbfile = io.read()
+        existing_dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile, backend="hdf5")
+        for dataset_io_configuration in existing_dataset_io_configurations:
+            print(dataset_io_configuration)
+
+
+if __name__ == "__main__":
+    main()
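
The later commits in this PR (481529f, a2145a1, and following) extend the same probing to the Zarr backend, where the analogous metadata lives on zarr.Array attributes. A standalone sketch, assuming zarr v2 and a hypothetical store path:

import numpy as np
import zarr

# Create a small chunked array on disk, then read back its storage metadata.
array = zarr.open_array("example.zarr", mode="w", shape=(10,), chunks=(1,), dtype="i8")
array[:] = np.arange(10)
print(array.shape)       # full_shape
print(array.chunks)      # chunk_shape
print(array.compressor)  # compression settings, e.g. Blosc(cname='lz4', ...)
print(array.filters)     # filter pipeline, if any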