Skip to content

Commit

Permalink
Merge pull request #180 from rlizzo/metadata-columns
Browse files Browse the repository at this point in the history
Column API and DataType Containers
  • Loading branch information
rlizzo authored Mar 4, 2020
2 parents 698a792 + d126d12 commit fc68f58
Show file tree
Hide file tree
Showing 104 changed files with 8,231 additions and 6,208 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/asvbench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- master

jobs:
build:
run_benchmarks:
runs-on: ${{ matrix.os }}
strategy:
max-parallel: 4
Expand All @@ -23,7 +23,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade setuptools virtualenv
pip install --upgrade setuptools
pip install virtualenv==16.7.9
pip install git+https://github.com/airspeed-velocity/asv
- name: Run Benchmarks
run: |
Expand Down
36 changes: 36 additions & 0 deletions .github/workflows/toxtest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# GitHub Actions workflow: run the tox test suite for every pull request
# targeting master, across a 3-OS x 2-Python-version matrix.
name: Run Test Suite

on:
  pull_request:
    branches:
      - master


jobs:
  run_test_suite:
    runs-on: ${{ matrix.platform }}
    strategy:
      # Let every matrix cell run to completion so all failures surface at once.
      fail-fast: false
      matrix:
        # https://help.github.com/articles/virtual-environments-for-github-actions
        platform:
          - ubuntu-latest
          - macos-latest
          - windows-latest
        python-version: [3.6, 3.7]

    steps:
      - uses: actions/checkout@v1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade setuptools pip wheel
          # Use the latest published version for myself :)
          python -m pip install tox-gh-actions
      - name: Test with tox
        # Arguments after `--` are passed through to pytest; `-p no:sugar`
        # disables the pytest-sugar plugin for plain CI log output.
        run: tox -- -p no:sugar
        env:
          PYTEST_XDIST_PROC_NR: 2
13 changes: 13 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ Change Log
Improvements
------------

* Column and backend classes are now fully serializable (pickleable) for ``read-only`` checkouts.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Modularized internal structure of API classes to easily allow new column layouts / data types
  to be added in the future.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Improved type / value checking of manual specification for column ``backend`` and ``backend_options``.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Standardized column data access API to follow python standard library ``dict`` methods API.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Memory usage of arrayset checkouts has been reduced by ~70% by using C-structs for allocating
sample record locating info.
(`#179 <https://github.com/tensorwerk/hangar-py/pull/179>`__) `@rlizzo <https://github.com/rlizzo>`__
Expand All @@ -22,6 +31,10 @@ Improvements
New Features
------------

* "string" type columns now supported alongside "ndarray" column type.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* New "column" API, which replaces "arrayset" name.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Arraysets can now contain "nested subsamples" under a common sample key.
(`#179 <https://github.com/tensorwerk/hangar-py/pull/179>`__) `@rlizzo <https://github.com/rlizzo>`__
* New API to add and remove samples from an arrayset.
Expand Down
23 changes: 18 additions & 5 deletions asv_bench/benchmarks/backend_comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class _WriterSuite:

params = ['hdf5_00', 'hdf5_01', 'numpy_10']
param_names = ['backend']
processes = 1
processes = 2
repeat = (2, 4, 30.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -59,6 +59,9 @@ def setup(self, backend):
raise NotImplementedError
except ValueError:
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column(
'aset', prototype=self.arr, backend=self.backend_code[backend])

def teardown(self, backend):
self.co.close()
Expand Down Expand Up @@ -91,7 +94,7 @@ class _ReaderSuite:

params = ['hdf5_00', 'hdf5_01', 'numpy_10']
param_names = ['backend']
processes = 1
processes = 2
repeat = (2, 4, 30.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 3
Expand Down Expand Up @@ -134,8 +137,15 @@ def setup_cache(self):
pass
except ValueError:
pass
except AttributeError:
co.add_ndarray_column(backend, prototype=arr, backend=code)

with co.arraysets as asets_cm:
try:
col = co.columns
except AttributeError:
col = co.arraysets

with col as asets_cm:
for aset in asets_cm.values():
changer = 0
for i in range(num_samples):
Expand All @@ -150,7 +160,10 @@ def setup(self, backend):
self.repo = Repository(path=os.getcwd(), exists=True)
self.co = self.repo.checkout(write=False)
try:
self.aset = self.co.arraysets[backend]
try:
self.aset = self.co.columns[backend]
except AttributeError:
self.aset = self.co.arraysets[backend]
except KeyError:
raise NotImplementedError

Expand All @@ -167,4 +180,4 @@ def read(self, backend):
class Read_50by50by10_3000_samples(_ReaderSuite):
method = 'read'
num_samples = 3000
time_read = _ReaderSuite.read
time_read = _ReaderSuite.read
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/hdf5_00.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_HDF5_00:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -42,6 +42,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='00')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -51,7 +53,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -117,4 +122,4 @@ class Read_50by50by10_300_samples(_WriterSuite_HDF5_00):

time_read = _WriterSuite_HDF5_00.read
track_repo_size = _WriterSuite_HDF5_00.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/hdf5_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_HDF5_01:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -45,6 +45,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='01')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -54,7 +56,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -120,4 +125,4 @@ class Read_50by50by10_300_samples(_WriterSuite_HDF5_01):

time_read = _WriterSuite_HDF5_01.read
track_repo_size = _WriterSuite_HDF5_01.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/numpy_10.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_NUMPY_10:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -42,6 +42,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -51,7 +53,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -117,4 +122,4 @@ class Read_50by50by10_300_samples(_WriterSuite_NUMPY_10):

time_read = _WriterSuite_NUMPY_10.read
track_repo_size = _WriterSuite_NUMPY_10.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
22 changes: 11 additions & 11 deletions asv_bench/benchmarks/commit_and_checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class MakeCommit(object):

params = [(5_000, 20_000), (5_000, 20_000)]
param_names = ['num_samples', 'num_metadata']
processes = 1
processes = 2
repeat = (2, 4, 20)
number = 1
warmup_time = 0
Expand All @@ -20,11 +20,11 @@ def setup(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
arr = np.array([0,], dtype=np.uint8)
try:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend_opts='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='10')
except TypeError:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='10')
except AttributeError:
aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

with aset as cm_aset:
for i in range(num_samples):
Expand All @@ -47,7 +47,7 @@ class CheckoutCommit(object):

params = [(5_000, 20_000), (5_000, 20_000)]
param_names = ['num_samples', 'num_metadata']
processes = 1
processes = 2
number = 1
repeat = (2, 4, 20)
warmup_time = 0
Expand All @@ -59,11 +59,11 @@ def setup(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
arr = np.array([0,], dtype=np.uint8)
try:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend_opts='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='10')
except TypeError:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='10')
except AttributeError:
aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

with aset as cm_aset:
for i in range(num_samples):
Expand All @@ -89,4 +89,4 @@ def time_checkout_read_only(self, num_samples, num_metadata):

def time_checkout_write_enabled(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
self.co.close()
self.co.close()
Loading

0 comments on commit fc68f58

Please sign in to comment.