Skip to content

Commit

Permalink
Merge pull request #180 from rlizzo/metadata-columns
Browse files Browse the repository at this point in the history
Column API and DataType Containers
  • Loading branch information
rlizzo authored Mar 4, 2020
2 parents 698a792 + d126d12 commit fc68f58
Show file tree
Hide file tree
Showing 104 changed files with 8,231 additions and 6,208 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/asvbench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- master

jobs:
build:
run_benchmarks:
runs-on: ${{ matrix.os }}
strategy:
max-parallel: 4
Expand All @@ -23,7 +23,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install --upgrade setuptools virtualenv
pip install --upgrade setuptools
pip install virtualenv==16.7.9
pip install git+https://github.com/airspeed-velocity/asv
- name: Run Benchmarks
run: |
Expand Down
36 changes: 36 additions & 0 deletions .github/workflows/toxtest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# GitHub Actions workflow: run the tox test suite for every pull request
# targeting master, across a 3-OS x 2-Python-version matrix.
name: Run Test Suite

on:
  pull_request:
    branches:
      - master


jobs:
  run_test_suite:
    runs-on: ${{ matrix.platform }}
    strategy:
      # Let every matrix cell run to completion so all failures surface at once.
      fail-fast: false
      matrix:
        # https://help.github.com/articles/virtual-environments-for-github-actions
        platform:
          - ubuntu-latest
          - macos-latest
          - windows-latest
        python-version: [3.6, 3.7]

    steps:
      - uses: actions/checkout@v1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade setuptools pip wheel
          # Use the latest published version for myself :)
          python -m pip install tox-gh-actions
      - name: Test with tox
        # Arguments after `--` are passed through to pytest; `-p no:sugar`
        # disables the pytest-sugar plugin for plain CI log output.
        run: tox -- -p no:sugar
        env:
          PYTEST_XDIST_PROC_NR: 2
13 changes: 13 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ Change Log
Improvements
------------

* Column and backend classes are now fully serializable (pickleable) for ``read-only`` checkouts.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Modularized internal structure of API classes to easily allow new column layouts / data types
  to be added in the future.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Improved type / value checking of manual specification for column ``backend`` and ``backend_options``.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Standardized column data access API to follow python standard library ``dict`` methods API.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Memory usage of arrayset checkouts has been reduced by ~70% by using C-structs for allocating
sample record locating info.
(`#179 <https://github.com/tensorwerk/hangar-py/pull/179>`__) `@rlizzo <https://github.com/rlizzo>`__
Expand All @@ -22,6 +31,10 @@ Improvements
New Features
------------

* "string" type columns now supported alongside "ndarray" column type.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* New "column" API, which replaces "arrayset" name.
(`#180 <https://github.com/tensorwerk/hangar-py/pull/180>`__) `@rlizzo <https://github.com/rlizzo>`__
* Arraysets can now contain "nested subsamples" under a common sample key.
(`#179 <https://github.com/tensorwerk/hangar-py/pull/179>`__) `@rlizzo <https://github.com/rlizzo>`__
* New API to add and remove samples from an arrayset.
Expand Down
23 changes: 18 additions & 5 deletions asv_bench/benchmarks/backend_comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class _WriterSuite:

params = ['hdf5_00', 'hdf5_01', 'numpy_10']
param_names = ['backend']
processes = 1
processes = 2
repeat = (2, 4, 30.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -59,6 +59,9 @@ def setup(self, backend):
raise NotImplementedError
except ValueError:
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column(
'aset', prototype=self.arr, backend=self.backend_code[backend])

def teardown(self, backend):
self.co.close()
Expand Down Expand Up @@ -91,7 +94,7 @@ class _ReaderSuite:

params = ['hdf5_00', 'hdf5_01', 'numpy_10']
param_names = ['backend']
processes = 1
processes = 2
repeat = (2, 4, 30.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 3
Expand Down Expand Up @@ -134,8 +137,15 @@ def setup_cache(self):
pass
except ValueError:
pass
except AttributeError:
co.add_ndarray_column(backend, prototype=arr, backend=code)

with co.arraysets as asets_cm:
try:
col = co.columns
except AttributeError:
col = co.arraysets

with col as asets_cm:
for aset in asets_cm.values():
changer = 0
for i in range(num_samples):
Expand All @@ -150,7 +160,10 @@ def setup(self, backend):
self.repo = Repository(path=os.getcwd(), exists=True)
self.co = self.repo.checkout(write=False)
try:
self.aset = self.co.arraysets[backend]
try:
self.aset = self.co.columns[backend]
except AttributeError:
self.aset = self.co.arraysets[backend]
except KeyError:
raise NotImplementedError

Expand All @@ -167,4 +180,4 @@ def read(self, backend):
class Read_50by50by10_3000_samples(_ReaderSuite):
method = 'read'
num_samples = 3000
time_read = _ReaderSuite.read
time_read = _ReaderSuite.read
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/hdf5_00.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_HDF5_00:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -42,6 +42,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='00')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -51,7 +53,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -117,4 +122,4 @@ class Read_50by50by10_300_samples(_WriterSuite_HDF5_00):

time_read = _WriterSuite_HDF5_00.read
track_repo_size = _WriterSuite_HDF5_00.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/hdf5_01.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_HDF5_01:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -45,6 +45,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='01')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -54,7 +56,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -120,4 +125,4 @@ class Read_50by50by10_300_samples(_WriterSuite_HDF5_01):

time_read = _WriterSuite_HDF5_01.read
track_repo_size = _WriterSuite_HDF5_01.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
11 changes: 8 additions & 3 deletions asv_bench/benchmarks/backends/numpy_10.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class _WriterSuite_NUMPY_10:

processes = 1
processes = 2
repeat = (2, 4, 20.0)
# repeat == tuple (min_repeat, max_repeat, max_time)
number = 2
Expand Down Expand Up @@ -42,6 +42,8 @@ def setup(self):
except ValueError:
# marks as skipped benchmark for commits which do not have this backend.
raise NotImplementedError
except AttributeError:
self.aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

if self.method == 'read':
with self.aset as cm_aset:
Expand All @@ -51,7 +53,10 @@ def setup(self):
self.co.commit('first commit')
self.co.close()
self.co = self.repo.checkout(write=False)
self.aset = self.co.arraysets['aset']
try:
self.aset = self.co.columns['aset']
except AttributeError:
self.aset = self.co.arraysets['aset']
else:
self.arr = arr

Expand Down Expand Up @@ -117,4 +122,4 @@ class Read_50by50by10_300_samples(_WriterSuite_NUMPY_10):

time_read = _WriterSuite_NUMPY_10.read
track_repo_size = _WriterSuite_NUMPY_10.size
track_repo_size.unit = 'bytes'
track_repo_size.unit = 'bytes'
22 changes: 11 additions & 11 deletions asv_bench/benchmarks/commit_and_checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class MakeCommit(object):

params = [(5_000, 20_000), (5_000, 20_000)]
param_names = ['num_samples', 'num_metadata']
processes = 1
processes = 2
repeat = (2, 4, 20)
number = 1
warmup_time = 0
Expand All @@ -20,11 +20,11 @@ def setup(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
arr = np.array([0,], dtype=np.uint8)
try:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend_opts='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='10')
except TypeError:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='10')
except AttributeError:
aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

with aset as cm_aset:
for i in range(num_samples):
Expand All @@ -47,7 +47,7 @@ class CheckoutCommit(object):

params = [(5_000, 20_000), (5_000, 20_000)]
param_names = ['num_samples', 'num_metadata']
processes = 1
processes = 2
number = 1
repeat = (2, 4, 20)
warmup_time = 0
Expand All @@ -59,11 +59,11 @@ def setup(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
arr = np.array([0,], dtype=np.uint8)
try:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend_opts='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend_opts='10')
except TypeError:
aset = self.co.arraysets.init_arrayset(
'aset', prototype=arr, backend='10')
aset = self.co.arraysets.init_arrayset('aset', prototype=arr, backend='10')
except AttributeError:
aset = self.co.add_ndarray_column('aset', prototype=arr, backend='10')

with aset as cm_aset:
for i in range(num_samples):
Expand All @@ -89,4 +89,4 @@ def time_checkout_read_only(self, num_samples, num_metadata):

def time_checkout_write_enabled(self, num_samples, num_metadata):
self.co = self.repo.checkout(write=True)
self.co.close()
self.co.close()
Loading

0 comments on commit fc68f58

Please sign in to comment.