diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..da55074 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,12 @@ +version: 2 +jobs: + build: + working_directory: ~/sparsity + docker: + - image: drtools/dask:latest + steps: + - checkout + - run: pip install boto3==1.7.84 botocore==1.10.84 moto==1.3.6 + - run: pip install pytest pytest-cov dask==1.0.0 . + - run: py.test --cov sparsity --cov-report xml sparsity + - run: bash <(curl -s https://codecov.io/bash) diff --git a/.coveragerc b/.coveragerc index 3f381eb..468c02e 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,2 @@ [run] -omit = sparsity/test/*, */__init__.py \ No newline at end of file +omit = sparsity/test/*, */__init__.py, */_version.py \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e678b57 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +sparsity/_version.py export-subst diff --git a/.gitignore b/.gitignore index ed66943..0084f4b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ build/ *.so traildb_sparse.c __pycache__ +*.egg-info +*.npz \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..606eeb5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2018, Data Revenue +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..54be32b --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include versioneer.py +include sparsity/_version.py diff --git a/Makefile b/Makefile deleted file mode 100644 index ae7290f..0000000 --- a/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -test: sparsity/traildb.cpython-35m-darwin.so - py.test sparsity/test -s - -sparsity/traildb.cpython-35m-darwin.so: sparsity/_traildb.pyx - python setup.py build_ext --inplace - -clean: - rm -f sparsity/_traildb.c sparsity/_traildb.cpython-35m-darwin.so \ No newline at end of file diff --git a/README.md b/README.md index 8aa470d..90176b3 100644 --- a/README.md +++ b/README.md @@ -2,121 +2,16 @@ [](https://circleci.com/gh/datarevenue-berlin/sparsity) [](https://codecov.io/gh/datarevenue-berlin/sparsity) -Sparse data processing toolbox. It builds on top of pandas and scipy to provide DataFrame -like API to work with sparse categorical data. - -It also provides a extremly fast C level -interface to read from traildb databases. This make it a highly performant package to use -for dataprocessing jobs especially such as log processing and/or clickstream ot click through data. +Sparse data processing toolbox. It builds on top of pandas and scipy to provide +DataFrame-like API to work with sparse data. In combination with dask it provides support to execute complex operations on a concurrent/distributed level. -## Attention -**Not ready for production** - -# Motivation -Many tasks especially in data analytics and machine learning domain make use of sparse -data structures to support the input of high dimensional data. - -This project was started -to build an efficient homogen sparse data processing pipeline. As of today dask has no -support for something as an sparse dataframe. We process big amounts of highdimensional data -on a daily basis at [datarevenue](http://datarevenue.com) and our favourite language -and ETL framework are python and dask. After chaining many function calls on scipy.sparse -csr matrices that involved handling of indices and column names to produce a sparse data -pipeline I decided to start this project. - -This package might be especially usefull to you if you have very big amounts of -sparse data such as clickstream data, categorical timeseries, log data or similarly sparse data. - -# Traildb access? -[Traildb](http://traildb.io/) is an amazing log style database. It was released recently -by AdRoll. It compresses event like data extremly efficient. Furthermore it provides a -fast C-level api to query it. - -Traildb has also python bindings but you still might need to iterate over many million -of users/trail or even both which has quite some overhead in python. -Therefore sparsity provides high speed access to the database in form of SparseFrame objects. -These are fast, efficient and intuitive enough to do further processing on. - -*ATM uuid and timestamp informations are lost but they will be provided as a pandas.MultiIndex -handled by the SparseFrame in a (very soon) future release.* - -```` -In [1]: from sparsity import SparseFrame - -In [2]: sdf = SparseFrame.read_traildb('pydata.tdb', field="title") - -In [3]: sdf.head() -Out[3]: - 0 1 2 3 4 ... 37388 37389 37390 37391 37392 -0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -1 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -2 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -3 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -4 1.0 0.0 0.0 0.0 0.0 ... 
0.0 0.0 0.0 0.0 0.0 - -[5 rows x 37393 columns] +More information and examples can be found in the [documentation](https://sparsity.readthedocs.io). -In [6]: %%timeit - ...: sdf = SparseFrame.read_traildb("/Users/kayibal/Code/traildb_to_sparse/traildb_to_sparse/traildb_to_sparse/sparsity/test/pydata.tdb", field="title") - ...: -10 loops, best of 3: 73.8 ms per loop -In [4]: sdf.shape -Out[4]: (109626, 37393) -```` - -# But wait pandas has SparseDataFrames and SparseSeries -Pandas has it's own implementation of sparse datastructures. Unfortuantely this structures -performs quite badly with a groupby sum aggregation which we also often use. Furthermore - doing a groupby on a pandasSparseDataFrame returns a dense DataFrame. This makes chaining - many groupby operations over multiple files cumbersome and less efficient. Consider -following example: - -``` -In [1]: import sparsity - ...: import pandas as pd - ...: import numpy as np - ...: - -In [2]: data = np.random.random(size=(1000,10)) - ...: data[data < 0.95] = 0 - ...: uids = np.random.randint(0,100,1000) - ...: combined_data = np.hstack([uids.reshape(-1,1),data]) - ...: columns = ['id'] + list(map(str, range(10))) - ...: - ...: sdf = pd.SparseDataFrame(combined_data, columns = columns, default_fill_value=0) - ...: - -In [3]: %%timeit - ...: sdf.groupby('id').sum() - ...: -1 loop, best of 3: 462 ms per loop - -In [4]: res = sdf.groupby('id').sum() - ...: res.values.nbytes - ...: -Out[4]: 7920 - -In [5]: data = np.random.random(size=(1000,10)) - ...: data[data < 0.95] = 0 - ...: uids = np.random.randint(0,100,1000) - ...: sdf = sparsity.SparseFrame(data, columns=np.asarray(list(map(str, range(10)))), index=uids) - ...: - -In [6]: %%timeit - ...: sdf.groupby_sum() - ...: -The slowest run took 4.20 times longer than the fastest. This could mean that an intermediate result is being cached. -1000 loops, best of 3: 1.25 ms per loop - -In [7]: res = sdf.groupby_sum() - ...: res.__sizeof__() - ...: -Out[7]: 6128 +## Installation ``` - -I'm not quite sure if there is some cached result but I don't think so. This only uses a -smart csr matrix multiplication to do the operation. \ No newline at end of file +$ pip install sparsity +``` \ No newline at end of file diff --git a/circle.yml b/circle.yml deleted file mode 100644 index c86a9a2..0000000 --- a/circle.yml +++ /dev/null @@ -1,20 +0,0 @@ -machine: - python: - version: 3.5.2 - environment: - LD_LIBRARY_PATH: '/usr/local/lib' - -dependencies: - cache_directories: - - /home/ubuntu/.cache/pip - override: - - pip install numpy cython 2>&1 - - pip install pytest pytest-cov - - pip install -v scipy pandas - - pip install dask[dataframe] -test: - override: - - pip install -e . - - py.test --cov sparsity --cov-report xml sparsity/test - post: - - bash <(curl -s https://codecov.io/bash) \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..e2f3575 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,155 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +apidoc: + sphinx-apidoc -fME -o api ../sparsity +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sparsity.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sparsity.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/sparsity" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sparsity" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." 
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/docs/api/dask-sparseframe-api.rst b/docs/api/dask-sparseframe-api.rst new file mode 100644 index 0000000..8149da2 --- /dev/null +++ b/docs/api/dask-sparseframe-api.rst @@ -0,0 +1,23 @@ +Dask SparseFrame API +==================== + +.. py:currentmodule:: sparsity.dask.core + +.. autosummary:: + SparseFrame + SparseFrame.assign + SparseFrame.compute + SparseFrame.columns + SparseFrame.get_partition + SparseFrame.index + SparseFrame.join + SparseFrame.known_divisions + SparseFrame.map_partitions + SparseFrame.npartitions + SparseFrame.persist + SparseFrame.repartition + SparseFrame.rename + SparseFrame.set_index + SparseFrame.sort_index + SparseFrame.to_delayed + SparseFrame.to_npz diff --git a/docs/api/reference.rst b/docs/api/reference.rst new file mode 100644 index 0000000..6b74855 --- /dev/null +++ b/docs/api/reference.rst @@ -0,0 +1,8 @@ +Reference +========= + +.. toctree:: + :maxdepth: 4 + + sparsity + sparsity.dask \ No newline at end of file diff --git a/docs/api/sparseframe-api.rst b/docs/api/sparseframe-api.rst new file mode 100644 index 0000000..e3f4798 --- /dev/null +++ b/docs/api/sparseframe-api.rst @@ -0,0 +1,40 @@ +SparseFrame API +=============== + +.. py:currentmodule:: sparsity.sparse_frame + +.. 
autosummary:: + SparseFrame + SparseFrame.add + SparseFrame.assign + SparseFrame.axes + SparseFrame.columns + SparseFrame.concat + SparseFrame.copy + SparseFrame.drop + SparseFrame.dropna + SparseFrame.fillna + SparseFrame.groupby_agg + SparseFrame.groupby_sum + SparseFrame.head + SparseFrame.index + SparseFrame.join + SparseFrame.max + SparseFrame.mean + SparseFrame.min + SparseFrame.multiply + SparseFrame.nnz + SparseFrame.read_npz + SparseFrame.reindex + SparseFrame.reindex_axis + SparseFrame.rename + SparseFrame.set_index + SparseFrame.sort_index + SparseFrame.sum + SparseFrame.take + SparseFrame.to_npz + SparseFrame.toarray + SparseFrame.todense + SparseFrame.values + SparseFrame.vstack + diff --git a/docs/api/sparsity.dask.rst b/docs/api/sparsity.dask.rst new file mode 100644 index 0000000..9256195 --- /dev/null +++ b/docs/api/sparsity.dask.rst @@ -0,0 +1,42 @@ +sparsity.dask sub-package +========================= + +.. automodule:: sparsity.dask + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. automodule:: sparsity.dask.core + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.dask.indexing + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.dask.io + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.dask.multi + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.dask.reshape + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.dask.shuffle + :members: + :undoc-members: + :show-inheritance: + + diff --git a/docs/api/sparsity.rst b/docs/api/sparsity.rst new file mode 100644 index 0000000..2636127 --- /dev/null +++ b/docs/api/sparsity.rst @@ -0,0 +1,27 @@ +sparsity package +================ + +.. automodule:: sparsity + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. automodule:: sparsity.indexing + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.io + :members: + :undoc-members: + :show-inheritance: + +.. automodule:: sparsity.sparse_frame + :members: + :undoc-members: + :show-inheritance: + + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..1ee0de0 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,267 @@ +# -*- coding: utf-8 -*- +# +# Sparsity documentation build configuration file, created by +# sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import os +import sys + +from recommonmark.transform import AutoStructify + +import sparsity +import sphinx_rtd_theme + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +github_doc_root = "http://sparsity.github.com/" + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.autosummary'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# Enable markdown files +source_parsers = { + '.md': 'recommonmark.parser.CommonMarkParser', +} + + +# The suffix of source filenames. +source_suffix = ['.rst', '.md'] + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'Sparsity' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = sparsity.__version__.split('-')[0] +# The full version, including alpha/beta/rc tags. +release = sparsity.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + "collapse_navigation": False, + "analytics_id": 'UA-74267417-1', +} +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+# html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Sparsitydoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', + 'Sparsity.tex', + u'Sparsity Documentation', + u"Datarevenue", 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'Sparsity', u'Sparsity Documentation', + [u"Datarevenue"], 1) +] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Sparsity', u'Sparsity Documentation', + u"Datarevenue", 'Sparsity', + 'Sparsity sparse data processing', 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. 
+# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' +autosummary_generate = True +html_sidebars = { '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'] } +# At the bottom of conf.py +def setup(app): + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + }, True) + app.add_transform(AutoStructify) diff --git a/docs/images/.gitkeep b/docs/images/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..8c5703d --- /dev/null +++ b/docs/index.md @@ -0,0 +1,41 @@ +# Sparsity - sparse data processing toolbox +[](https://circleci.com/gh/datarevenue-berlin/sparsity) +[](https://codecov.io/gh/datarevenue-berlin/sparsity) + +Sparsity builds on top of Pandas and Scipy to provide a DataFrame-like API +to work with numerical, homogeneous sparse data. + +Sparsity provides Pandas-like indexing capabilities and group transformations +on Scipy csr matrices. This has proven to be extremely efficient as +shown below. + +Furthermore, we provide a distributed implementation of this data structure by +relying on the [Dask](https://dask.pydata.org) framework. This includes +distributed sorting, partitioning, grouping and much more. + +Although we try to mimic the Pandas DataFrame API, some operations +and parameters don't make sense on sparse or homogeneous data. Thus +some interfaces might be changed slightly in their semantics and/or inputs. + +## Install +Sparsity is available from PyPI: +``` +# Install using pip +$ pip install sparsity +``` + +## Contents +```eval_rst +.. toctree:: + :maxdepth: 2 + + sources/about + sources/user_guide + api/sparseframe-api + api/dask-sparseframe-api + api/reference +``` + +## Attention +Please use this package with care: it is a young project and might still +contain some bugs. diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..32c0a04 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^<target^>` where ^<target^> is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. 
doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sparsity.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sparsity.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. 
+	echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..f28f12f --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +sphinx +sphinx_rtd_theme +recommonmark \ No newline at end of file diff --git a/docs/sources/about.rst b/docs/sources/about.rst new file mode 100644 index 0000000..63bdfab --- /dev/null +++ b/docs/sources/about.rst @@ -0,0 +1,71 @@ +About Sparsity +============== + +Motivation +---------- +Many tasks, especially in the data analytics and machine learning domains, make use +of sparse data structures to support the input of high dimensional data. + +This project was started to build an efficient homogeneous sparse data +processing pipeline. As of today Dask has no support for something like a sparse +dataframe. We process large amounts of high-dimensional data on a daily basis at +Datarevenue_ and our favourite language and ETL +framework are Python and Dask. After chaining many function calls on +scipy.sparse csr matrices that involved handling of indices and column names to +produce a sparse data pipeline, we decided to start this project. + +This package might be especially useful to you if you have very large amounts of +sparse data such as clickstream data, categorical timeseries, log data or +similarly sparse data. + +.. _Datarevenue: https://datarevenue.com + + +Comparison to Pandas SparseDataFrame +------------------------------------ +Pandas has its own implementation of sparse data structures. Unfortunately these +structures perform quite badly with a groupby-sum aggregation which we use +frequently. Furthermore, doing a groupby on a Pandas SparseDataFrame returns a +dense DataFrame. This makes chaining many groupby operations over multiple +files cumbersome and less efficient. Consider the following example:: + + In [1]: import sparsity + ...: import pandas as pd + ...: import numpy as np + ...: + + In [2]: data = np.random.random(size=(1000,10)) + ...: data[data < 0.95] = 0 + ...: uids = np.random.randint(0,100,1000) + ...: combined_data = np.hstack([uids.reshape(-1,1),data]) + ...: columns = ['id'] + list(map(str, range(10))) + ...: + ...: sdf = pd.SparseDataFrame(combined_data, columns = columns, default_fill_value=0) + ...: + + In [3]: %%timeit + ...: sdf.groupby('id').sum() + ...: + 1 loop, best of 3: 462 ms per loop + + In [4]: res = sdf.groupby('id').sum() + ...: res.values.nbytes + ...: + Out[4]: 7920 + + In [5]: data = np.random.random(size=(1000,10)) + ...: data[data < 0.95] = 0 + ...: uids = np.random.randint(0,100,1000) + ...: sdf = sparsity.SparseFrame(data, columns=np.asarray(list(map(str, range(10)))), index=uids) + ...: + + In [6]: %%timeit + ...: sdf.groupby_sum() + ...: + The slowest run took 4.20 times longer than the fastest. 
+ 1000 loops, best of 3: 1.25 ms per loop + + In [7]: res = sdf.groupby_sum() + ...: res.__sizeof__() + ...: + Out[7]: 6128 diff --git a/docs/sources/user_guide.md b/docs/sources/user_guide.md new file mode 100644 index 0000000..c2302ab --- /dev/null +++ b/docs/sources/user_guide.md @@ -0,0 +1,231 @@ +# Sparsity User Guide + +## Creating a SparseFrame + +Create a SparseFrame from numpy array: +```pycon +>>> import sparsity +>>> import numpy as np + +>>> a = np.random.rand(10, 5) +>>> a[a < 0.9] = 0 +>>> sf = sparsity.SparseFrame(a, index=np.arange(10, 20), columns=list('ABCDE')) +>>> sf + A B C D E +10 0.0 0.000000 0.0 0.000000 0.000000 +11 0.0 0.962851 0.0 0.000000 0.000000 +12 0.0 0.858180 0.0 0.867824 0.930348 +13 0.0 0.000000 0.0 0.000000 0.968163 +14 0.0 0.000000 0.0 0.000000 0.985610 +[10x5 SparseFrame of type '<class 'float64'>' + with 10 stored elements in Compressed Sparse Row format] +``` + +You can also create a SparseFrame from Pandas DataFrame. Index and columns +will be preserved: +```pycon +>>> import pandas as pd + +>>> df = pd.DataFrame(a, index=np.arange(10, 20), columns=list('ABCDE')) +>>> sparsity.SparseFrame(df) + A B C D E +10 0.0 0.000000 0.0 0.000000 0.000000 +11 0.0 0.962851 0.0 0.000000 0.000000 +12 0.0 0.858180 0.0 0.867824 0.930348 +13 0.0 0.000000 0.0 0.000000 0.968163 +14 0.0 0.000000 0.0 0.000000 0.985610 +[10x5 SparseFrame of type '<class 'float64'>' + with 10 stored elements in Compressed Sparse Row format] +``` + +Initialization from Scipy CSR matrix is also possible. If you don't pass +index or columns, defaults will be used: +```pycon +>>> import scipy.sparse + +>>> csr = scipy.sparse.rand(10, 5, density=0.1, format='csr') +>>> sparsity.SparseFrame(csr) + 0 1 2 3 4 +0 0.638314 0.0 0.000000 0.0 0.0 +1 0.000000 0.0 0.000000 0.0 0.0 +2 0.000000 0.0 0.043411 0.0 0.0 +3 0.000000 0.0 0.000000 0.0 0.0 +4 0.000000 0.0 0.222951 0.0 0.0 +[10x5 SparseFrame of type '<class 'float64'>' + with 5 stored elements in Compressed Sparse Row format] +``` + +## Indexing + +Indexing a SparseFrame with column name gives a new SparseFrame: +```pycon +>>> sf['A'] + A +10 0.0 +11 0.0 +12 0.0 +13 0.0 +14 0.0 +[10x1 SparseFrame of type '<class 'float64'>' + with 0 stored elements in Compressed Sparse Row format] +``` + +Similarly for a list of column names: +```pycon +>>> sf[['A', 'B']] + A B +10 0.0 0.000000 +11 0.0 0.962851 +12 0.0 0.858180 +13 0.0 0.000000 +14 0.0 0.000000 +[10x2 SparseFrame of type '<class 'float64'>' + with 3 stored elements in Compressed Sparse Row format] +``` + +## Basic arithmetic operations + +Sum, mean, min and max methods are called on underlying Scipy CSR matrix +object. They can be computed over whole SparseFrame or along columns/rows: +```pycon +>>> sf.sum(axis=0) +matrix([[0. , 2.79813655, 0.84659119, 2.8522892 , 2.88412053]]) + +>>> sf.mean(axis=1) +matrix([[0. ], + [0.19257014], + [0.53127046], + [0.19363253], + [0.19712191], + [0. ], + [0.19913979], + [0.19542124], + [0. 
], + [0.36707143]]) + +>>> sf.min() +0.0 + +>>> sf.max() +0.9956989680903189 +``` + +Add 2 SparseFrames: +```pycon +>>> sf.add(sf) + A B C D E +10 0.0 0.000000 0.0 0.000000 0.000000 +11 0.0 1.925701 0.0 0.000000 0.000000 +12 0.0 1.716359 0.0 1.735649 1.860697 +13 0.0 0.000000 0.0 0.000000 1.936325 +14 0.0 0.000000 0.0 0.000000 1.971219 +[10x5 SparseFrame of type '<class 'float64'>' + with 10 stored elements in Compressed Sparse Row format] +``` + +Multiply each row/column by a number: +```pycon +>>> sf.multiply(np.arange(10), axis='index') + A B C D E +10 0.0 0.000000 0.0 0.000000 0.000000 +11 0.0 0.962851 0.0 0.000000 0.000000 +12 0.0 1.716359 0.0 1.735649 1.860697 +13 0.0 0.000000 0.0 0.000000 2.904488 +14 0.0 0.000000 0.0 0.000000 3.942438 +[10x5 SparseFrame of type '<class 'float64'>' + with 10 stored elements in Compressed Sparse Row format] + +>>> sf.multiply(np.arange(5), axis='columns') + A B C D E +10 0.0 0.000000 0.0 0.000000 0.000000 +11 0.0 0.962851 0.0 0.000000 0.000000 +12 0.0 0.858180 0.0 2.603473 3.721393 +13 0.0 0.000000 0.0 0.000000 3.872651 +14 0.0 0.000000 0.0 0.000000 3.942438 +[10x5 SparseFrame of type '<class 'float64'>' + with 10 stored elements in Compressed Sparse Row format] +``` + +## Joining + +By default SparseFrames are joined on their indexes: +```pycon +>>> sf2 = sparsity.SparseFrame(np.random.rand(3, 2), index=[9, 10, 11], columns=['X', 'Y']) +>>> sf2 + X Y +9 0.182890 0.061269 +10 0.039956 0.595605 +11 0.407291 0.496680 +[3x2 SparseFrame of type '<class 'float64'>' + with 6 stored elements in Compressed Sparse Row format] + +>>> sf.join(sf2) + A B C D E X Y +9 0.0 0.000000 0.0 0.000000 0.000000 0.182890 0.061269 +10 0.0 0.000000 0.0 0.000000 0.000000 0.039956 0.595605 +11 0.0 0.962851 0.0 0.000000 0.000000 0.407291 0.496680 +12 0.0 0.858180 0.0 0.867824 0.930348 0.000000 0.000000 +13 0.0 0.000000 0.0 0.000000 0.968163 0.000000 0.000000 +[11x7 SparseFrame of type '<class 'float64'>' + with 16 stored elements in Compressed Sparse Row format] +``` + +You can also join on columns: +```pycon +>>> sf3 = sparsity.SparseFrame(np.random.rand(3, 2), index=[97, 98, 99], columns=['E', 'F']) +>>> sf3 + E F +97 0.738614 0.958507 +98 0.868556 0.230316 +99 0.322914 0.587337 +[3x2 SparseFrame of type '<class 'float64'>' + with 6 stored elements in Compressed Sparse Row format] + +>>> sf.join(sf3, axis=0).iloc[-5:] + A B C D E F +18 0.0 0.0 0.000000 0.000000 0.000000 0.000000 +19 0.0 0.0 0.846591 0.988766 0.000000 0.000000 +97 0.0 0.0 0.000000 0.000000 0.738614 0.958507 +98 0.0 0.0 0.000000 0.000000 0.868556 0.230316 +99 0.0 0.0 0.000000 0.000000 0.322914 0.587337 +[5x6 SparseFrame of type '<class 'float64'>' + with 8 stored elements in Compressed Sparse Row format] +``` + +## Groupby + +Groupby-sum operation is optimized for sparse case: +```pycon +>>> df = pd.DataFrame({'X': [1, 1, 1, 0], +... 'Y': [0, 1, 0, 1], +... 'gr': ['a', 'a', 'b', 'b'], +... 
'day': [10, 11, 11, 12]}) +>>> df = df.set_index(['day', 'gr']) +>>> sf4 = sparsity.SparseFrame(df) +>>> sf4 + X Y +day gr +10 a 1.0 0.0 +11 a 1.0 1.0 + b 1.0 0.0 +12 b 0.0 1.0 +[4x2 SparseFrame of type '<class 'float64'>' + with 5 stored elements in Compressed Sparse Row format] + +>>> sf4.groupby_sum(level=1) + X Y +a 2.0 1.0 +b 1.0 1.0 +[2x2 SparseFrame of type '<class 'float64'>' + with 4 stored elements in Compressed Sparse Row format] +``` + +Operations other then sum can also be applied: +```pycon +>>> sf4.groupby_agg(level=1, agg_func=lambda x: x.mean(axis=0)) + X Y +a 1.0 0.5 +b 0.5 0.5 +[2x2 SparseFrame of type '<class 'float64'>' + with 4 stored elements in Compressed Sparse Row format] +``` diff --git a/install_traildbcore.sh b/install_traildbcore.sh deleted file mode 100644 index c1c7953..0000000 --- a/install_traildbcore.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -git clone https://github.com/traildb/traildb traildb-core && pushd traildb-core -wget https://mirrors.kernel.org/ubuntu/pool/universe/j/judy/libjudy-dev_1.0.5-5_amd64.deb \ - https://mirrors.kernel.org/ubuntu/pool/universe/j/judy/libjudydebian1_1.0.5-5_amd64.deb -sudo dpkg -i libjudy-dev_1.0.5-5_amd64.deb libjudydebian1_1.0.5-5_amd64.deb -sudo apt-get update -sudo apt-get install -y libjudy-dev libarchive-dev pkg-config build-essential -sudo python ./waf configure -sudo python ./waf build -sudo python ./waf install \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..275e8e5 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[versioneer] +VCS=git +style=pep440 +versionfile_source=sparsity/_version.py +versionfile_build=sparsity/_version.py +tag_prefix=v diff --git a/setup.py b/setup.py index be9ee18..19ea8e5 100644 --- a/setup.py +++ b/setup.py @@ -1,31 +1,41 @@ -from distutils.core import setup, Extension +import versioneer +from distutils.core import setup from setuptools import find_packages -try: - import traildb - import numpy as np - from Cython.Build import cythonize - ext = Extension("sparsity._traildb", - ['sparsity/_traildb.pyx', - 'sparsity/src/traildb_coo.c', - 'sparsity/src/hashtable.c', - 'sparsity/src/linklist.c'], - include_dirs=['/usr/local/include/', np.get_include()], - libraries=["traildb"]) - ext_modules = cythonize([ext]) -except (ImportError, OSError): - ext_modules = None +packages = find_packages() +packages.remove('sparsity.test') + +with open("README.md", "r") as fh: + long_description = fh.read() + setup( name='sparsity', - version='0.5.1', - ext_modules = ext_modules, + version=versioneer.get_version(), author='Alan Hoeng', author_email='alan.f.hoeng@gmail.com', - packages=find_packages(), + description="Sparse data processing toolbox", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/datarevenue-berlin/sparsity", + packages=packages, + cmdclass=versioneer.get_cmdclass(), install_requires=[ - 'pandas>=0.19.2', - 'scipy>=0.18.1', - 'numpy>=1.12.0' - ], - zip_safe=False -) \ No newline at end of file + 'pandas>=0.21.0,<=0.25.0', + 'scipy>0.19.1', + 'numpy>=1.12.0', + 's3fs>=0.1.0', + 'dask>=2.1.0', + 'fsspec>=0.3.3', + ], + test_requires=[ + 'boto3==1.7.84', + 'botocore==1.10.84', + 'moto==1.3.6' + ], + zip_safe=False, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + ], +) diff --git a/sparsity/__init__.py b/sparsity/__init__.py index 7983873..280cc31 100644 --- 
a/sparsity/__init__.py +++ b/sparsity/__init__.py @@ -1 +1,4 @@ from sparsity.sparse_frame import SparseFrame, sparse_one_hot +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/sparsity/_traildb.pyx b/sparsity/_traildb.pyx deleted file mode 100644 index b3b93cf..0000000 --- a/sparsity/_traildb.pyx +++ /dev/null @@ -1,45 +0,0 @@ -from libc.stdlib cimport malloc, free -cimport numpy as np -from cpython.bytes cimport PyBytes_FromString -np.import_array() - -cdef extern from "stdint.h": - ctypedef unsigned long long uint64_t - ctypedef unsigned char uint8_t -# cdefine the signature of our c function -cdef extern from "src/traildb_coo.h": - uint64_t traildb_coo_repr (const char * path, const char * fieldname, - uint64_t * row_idx, uint64_t * col_idx, - uint8_t * uids, uint64_t * timestamps, - char** col_names, uint64_t** str_lens) - -# create the wrapper code, with numpy type annotations -def traildb_coo_repr_func(char * path, char * fieldname, - np.ndarray[np.uint64_t, ndim=1, mode="c"] row_idx not None, - np.ndarray[np.uint64_t, ndim=1, mode="c"] col_idx not None, - np.ndarray[np.uint8_t, ndim=2, mode="c"] uuids not None, - np.ndarray[np.uint64_t, ndim=1, mode="c"] timestamps not None): - #cdef uint8_t** uuids = <uint8_t**>malloc(len(row_idx) * sizeof(uint8_t*)) - cdef uint8_t[:,:] cython_uuids_view = uuids - cdef uint8_t *c_uuid_array = &cython_uuids_view[0, 0] - cdef char* col_names; - cdef uint64_t* lens; - n_cols = traildb_coo_repr(path, fieldname, - <uint64_t*> np.PyArray_DATA(row_idx), - <uint64_t*> np.PyArray_DATA(col_idx), - c_uuid_array, - <uint64_t*> np.PyArray_DATA(timestamps), - <char**> &col_names, - <uint64_t**> &lens) - - cols_raw = PyBytes_FromString(col_names) - cols = [] - start = 0 - end = 0 - for i in range(n_cols): - end = start + lens[i] - cols.append(cols_raw[start:end].decode()) - start = end - free(col_names) - free(lens) - return cols \ No newline at end of file diff --git a/sparsity/_version.py b/sparsity/_version.py new file mode 100644 index 0000000..0f6e225 --- /dev/null +++ b/sparsity/_version.py @@ -0,0 +1,520 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "None" + cfg.versionfile_source = "sparsity/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always --long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags.
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
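# For example: with versionfile_source presumably set to 'sparsity/_version.py'
# (two path components), the loop below applies os.path.dirname twice, turning
# .../sparsity/_version.py into the repository root.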
+ for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} diff --git a/sparsity/dask/__init__.py b/sparsity/dask/__init__.py index fa98a41..8913759 100644 --- a/sparsity/dask/__init__.py +++ b/sparsity/dask/__init__.py @@ -1,22 +1,3 @@ -import dask -from dask.dataframe.core import _get_return_type as \ - df_get_return_type - -import sparsity as sp from .core import SparseFrame -from .io import from_pandas, read_npz +from .io_ import from_pandas, read_npz, from_ddf from .reshape import one_hot_encode - - -def _get_return_type_sparsity(meta): - # We need this to make dask dataframes _LocIndexer to work - # on SparseFrames - if isinstance(meta, SparseFrame): - meta = meta._meta - - if isinstance(meta, sp.SparseFrame): - return SparseFrame - - return df_get_return_type(meta) - -dask.dataframe.core._get_return_type = _get_return_type_sparsity \ No newline at end of file diff --git a/sparsity/dask/core.py b/sparsity/dask/core.py index 617e5df..8bc6cd3 100644 --- a/sparsity/dask/core.py +++ b/sparsity/dask/core.py @@ -1,61 +1,110 @@ -from scipy import sparse +from operator import getitem, itemgetter +from pprint import pformat import dask +import dask.dataframe as dd +import numpy as np import pandas as pd from dask import threaded from dask.base import normalize_token, tokenize -from dask.dataframe.utils import make_meta as dd_make_meta, _nonempty_index +from dask.dataframe import methods +from dask.dataframe.core import (Index, Scalar, Series, _Frame, _emulate, + _extract_meta, _maybe_from_pandas, apply, + check_divisions, funcname, get_parallel_type, + hash_shard, no_default, partial, + partial_by_order, split_evenly, + split_out_on_index) +from dask.dataframe.utils import _nonempty_index, make_meta, meta_nonempty from dask.delayed import Delayed -from dask.optimize import cull -from toolz import merge +from dask.optimization import cull +from dask.utils import derived_from +from scipy import sparse +from toolz import merge, partition_all, remove import sparsity as sp from sparsity.dask.indexing import _LocIndexer -def _make_meta(inp): +@get_parallel_type.register(sp.SparseFrame) +def get_parallel_type_sparsity(_): + return SparseFrame + + +@make_meta.register(sp.SparseFrame) +def make_meta_sparsity(inp): + if isinstance(inp, sp.SparseFrame) and inp.empty: + return inp if isinstance(inp, sp.SparseFrame): return inp.iloc[:0] else: - meta = dd_make_meta(inp) - if isinstance(meta, pd.core.generic.NDFrame): - return sp.SparseFrame(meta) - return meta + raise NotImplementedError("Can't make meta for type: {}" + .format(str(type(inp)))) -def _meta_nonempty(x): + +@meta_nonempty.register(sp.SparseFrame) +def meta_nonempty_sparsity(x): idx = _nonempty_index(x.index) return sp.SparseFrame(sparse.csr_matrix((len(idx), len(x.columns))), index=idx, columns=x.columns) + def optimize(dsk, keys, **kwargs): dsk, _ = cull(dsk, keys) return dsk + def finalize(results): + if all(map(lambda x: x.empty, results)): + return results[0] 
results = [r for r in results if not r.empty] return sp.SparseFrame.vstack(results) -class SparseFrame(dask.base.Base): - _optimize = staticmethod(optimize) - _default_get = staticmethod(threaded.get) - _finalize = staticmethod(finalize) +class SparseFrame(dask.base.DaskMethodsMixin): def __init__(self, dsk, name, meta, divisions=None): + if isinstance(meta, SparseFrame): + # TODO: remove this case once we subclass from dask._Frame + meta = meta._meta + if not isinstance(meta, sp.SparseFrame): + meta = sp.SparseFrame(meta) + self.dask = dsk self._name = name - self._meta = _make_meta(meta) - - if divisions: - self.known_divisions = True - else: - self.known_divisions = False + self._meta = make_meta(meta) self.divisions = tuple(divisions) self.ndim = 2 self.loc = _LocIndexer(self) + def __getitem__(self, item): + return self.map_partitions(itemgetter(item), self._meta[item], + name='__getitem__') + + def __dask_graph__(self): + return self.dask + + __dask_scheduler__ = staticmethod(dask.threaded.get) + + @staticmethod + def __dask_optimize__(dsk, keys, **kwargs): + # We cull unnecessary tasks here. Note that this isn't necessary, + # dask will do this automatically, this just shows one optimization + # you could do. + dsk2 = optimize(dsk, keys) + return dsk2 + + + def __dask_postpersist__(self): + def rebuild(dsk, *extra_args): + return SparseFrame(dsk, name=self._name, + meta=self._meta, + divisions=self.divisions) + return rebuild, () + + def __dask_postcompute__(self): + return finalize, () @property def npartitions(self): @@ -63,15 +112,68 @@ def npartitions(self): @property def _meta_nonempty(self): - return _meta_nonempty(self._meta) + return meta_nonempty_sparsity(self._meta) + + @property + def columns(self): + return self._meta.columns + + @property + def known_divisions(self): + """Whether divisions are already known""" + return len(self.divisions) > 0 and self.divisions[0] is not None + + @property + def index(self): + """Return dask Index instance""" + name = self._name + '-index' + dsk = {(name, i): (getattr, key, 'index') + for i, key in enumerate(self.__dask_keys__())} + + return Index(merge(dsk, self.dask), name, + self._meta.index, self.divisions) def map_partitions(self, func, meta, *args, **kwargs): return map_partitions(func, self, meta, *args, **kwargs) + # noinspection PyTypeChecker + def todense(self, pandas=True): + """Convert into Dask DataFrame or Series + + Returns + ------- + res: dd.DataFrame | dd.Series + """ + if not pandas: + raise NotImplementedError('Conversion to dask.array is ' + 'currently not supported!') + meta = self._meta.todense() + + dfs = [obj.todense(pandas=pandas) for obj in self.to_delayed()] + + return dd.from_delayed(dfs, meta=meta) + def to_delayed(self): - return [Delayed(k, self.dask) for k in self._keys()] + return [Delayed(k, self.dask) for k in self.__dask_keys__()] + + @derived_from(sp.SparseFrame) + def assign(self, **kwargs): + for k, v in kwargs.items(): + if not (isinstance(v, (Series, Scalar, pd.Series)) or + np.isscalar(v)): + raise TypeError("Column assignment doesn't support type " + "{0}".format(type(v).__name__)) + pairs = list(sum(kwargs.items(), ())) + + # Figure out columns of the output + df2 = self._meta.assign(**_extract_meta(kwargs)) + return elemwise(methods.assign, self, *pairs, meta=df2) - def _keys(self): + @derived_from(sp.SparseFrame) + def add(self, other, how='outer', fill_value=0,): + return elemwise(sp.SparseFrame.add, self, other, meta=self._meta) + + def __dask_keys__(self): return [(self._name, i) for i 
in range(self.npartitions)] @property @@ -96,6 +198,140 @@ def _repr_data(self): data = [['...'] * len(cols)] * len(index) return pd.DataFrame(data, columns=cols, index=index) + def repartition(self, divisions=None, npartitions=None, force=False): + if divisions is not None: + return repartition(self, divisions, force) + elif npartitions is not None: + return repartition_npartitions(self, npartitions) + raise ValueError('Either divisions or npartitions must be supplied') + + def get_partition(self, n): + """Get a sparse dask DataFrame/Series representing + the `nth` partition.""" + if 0 <= n < self.npartitions: + name = 'get-partition-%s-%s' % (str(n), self._name) + dsk = {(name, 0): (self._name, n)} + divisions = self.divisions[n:n + 2] + return SparseFrame(merge(self.dask, dsk), name, + self._meta, divisions) + else: + msg = "n must be 0 <= n < {0}".format(self.npartitions) + raise ValueError(msg) + + def join(self, other, on=None, how='left', lsuffix='', + rsuffix='', npartitions=None): + from .multi import join_indexed_sparseframes + + if isinstance(other, sp.SparseFrame): + meta = sp.SparseFrame.join(self._meta_nonempty, + other, + how=how) + # make empty meta + meta = meta.loc[[False] * meta.shape[0], :] + join_func = partial(sp.SparseFrame.join, other=other, + how=how) + return self.map_partitions(join_func, meta=meta, name='simplejoin') + if not isinstance(other, (SparseFrame)): + raise ValueError('other must be SparseFrame') + + return join_indexed_sparseframes( + self, other, how=how) + + def to_npz(self, filename, blocksize=None, + storage_options=None, compute=True): + import sparsity.dask.io_ as dsp_io + return dsp_io.to_npz(self, filename, blocksize, storage_options, compute) + + def groupby_sum(self, split_out=1, split_every=8): + meta = self._meta + if self.known_divisions: + res = self.map_partitions(sp.SparseFrame.groupby_sum, + meta=meta) + res.divisions = self.divisions + if split_out and split_out != self.npartitions: + res = res.repartition(npartitions=split_out) + return res + token = 'groupby_sum' + return apply_concat_apply(self, + chunk=sp.SparseFrame.groupby_sum, + aggregate=sp.SparseFrame.groupby_sum, + meta=meta, token=token, split_every=split_every, + split_out=split_out, split_out_setup=split_out_on_index) + + def sort_index(self, npartitions=None, divisions=None, **kwargs): + """Sort the DataFrame index (row labels) + This realigns the dataset to be sorted by the index. This can have a + significant impact on performance, because joins, groupbys, lookups, etc. + are all much faster on that column. However, this performance increase + comes with a cost, sorting a parallel dataset requires expensive shuffles. + Often we ``sort_index`` once directly after data ingest and filtering and + then perform many cheap computations off of the sorted dataset. + This function operates exactly like ``pandas.sort_index`` except with + different performance costs (it is much more expensive). Under normal + operation this function does an initial pass over the index column to + compute approximate qunatiles to serve as future divisions. It then passes + over the data a second time, splitting up each input partition into several + pieces and sharing those pieces to all of the output partitions now in + sorted order. + In some cases we can alleviate those costs, for example if your dataset is + sorted already then we can avoid making many small pieces or if you know + good values to split the new index column then we can avoid the initial + pass over the data. 
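A minimal sketch of that shortcut, using a hypothetical dask SparseFrame ``dsf`` with an integer index and hypothetical division values; divisions given explicitly are used as-is, so the quantile pass can be skipped:

>>> dsf = dsf.sort_index(divisions=[0, 1000, 2000, 3000])  # doctest: +SKIP
>>> dsf.known_divisions  # doctest: +SKIP
True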
For example if your new index is a datetime index and + your data is already sorted by day then this entire operation can be done + for free. You can control these options with the following parameters. + + Parameters + ---------- + npartitions: int, None, or 'auto' + The ideal number of output partitions. If None use the same as + the input. If 'auto' then decide by memory use. + divisions: list, optional + Known values on which to separate index values of the partitions. + See http://dask.pydata.org/en/latest/dataframe-design.html#partitions + Defaults to computing this with a single pass over the data. Note + that if ``sorted=True``, specified divisions are assumed to match + the existing partitions in the data. If this is untrue, you should + leave divisions empty and call ``repartition`` after ``set_index``. + partition_size: int, optional + if npartitions is set to auto repartition the dataframe into + partitions of this size. + """ + from .shuffle import sort_index + return sort_index(self, npartitions=npartitions, + divisions=divisions, **kwargs) + + @derived_from(sp.SparseFrame) + def set_index(self, column=None, idx=None, level=None): + if column is None and idx is None and level is None: + raise ValueError("Either column, idx or level should not be None") + if idx is not None: + raise NotImplementedError('Only column or level supported') + new_name = self._meta.index.names[level] if level else column + + if level is not None: + new_idx = self._meta.index.get_level_values(level) + else: + new_idx = pd.Index(np.empty((0,0), dtype=self._meta.values.dtype)) + new_idx.name = new_name + + meta = self._meta.set_index(idx=new_idx) + res = self.map_partitions(sp.SparseFrame.set_index, meta=meta, + column=column, idx=idx, level=level) + res.divisions = tuple([None] * ( self.npartitions + 1)) + return res + + def rename(self, columns): + _meta = self._meta.rename(columns=columns) + return self.map_partitions(sp.SparseFrame.rename, meta=_meta, + columns=columns) + + def drop(self, labels, axis=1): + if axis != 1: + raise NotImplementedError('Axis != 1 is currently not supported.') + _meta = self._meta.drop(labels=labels) + return self.map_partitions(sp.SparseFrame.drop, meta=_meta, + labels=labels) + def __repr__(self): return \ """ @@ -110,9 +346,325 @@ def __repr__(self): ) -def map_partitions(func, ddf, meta, **kwargs): +required = {'left': [0], 'right': [1], 'inner': [0, 1], 'outer': []} + + +def repartition(df, divisions=None, force=False): + """ Repartition dataframe along new divisions + Dask.DataFrame objects are partitioned along their index. Often when + multiple dataframes interact we need to align these partitionings. The + ``repartition`` function constructs a new DataFrame object holding the same + data but partitioned on different values. It does this by performing a + sequence of ``loc`` and ``concat`` calls to split and merge the previous + generation of partitions. + Parameters + ---------- + divisions : list + List of partitions to be used + force : bool, default False + Allows the expansion of the existing divisions. + If False then the new divisions lower and upper bounds must be + the same as the old divisions. 
+ Examples + -------- + >>> sf = sf.repartition([0, 5, 10, 20]) # doctest: +SKIP + """ + + token = tokenize(df, divisions) + if isinstance(df, SparseFrame): + tmp = 'repartition-split-' + token + out = 'repartition-merge-' + token + dsk = repartition_divisions(df.divisions, divisions, + df._name, tmp, out, force=force) + return SparseFrame(merge(df.dask, dsk), out, + df._meta, divisions) + raise ValueError('Data must be DataFrame or Series') + + +def repartition_divisions(a, b, name, out1, out2, force=False): + """ dask graph to repartition dataframe by new divisions + + Parameters + ---------- + a : tuple + old divisions + b : tuple, list + new divisions + name : str + name of old dataframe + out1 : str + name of temporary splits + out2 : str + name of new dataframe + force : bool, default False + Allows the expansion of the existing divisions. + If False then the new divisions lower and upper bounds must be + the same as the old divisions. + + Examples + -------- + >>> repartition_divisions([1, 3, 7], [1, 4, 6, 7], 'a', 'b', 'c') # doctest: +SKIP + {('b', 0): (<function boundary_slice at ...>, ('a', 0), 1, 3, False), + ('b', 1): (<function boundary_slice at ...>, ('a', 1), 3, 4, False), + ('b', 2): (<function boundary_slice at ...>, ('a', 1), 4, 6, False), + ('b', 3): (<function boundary_slice at ...>, ('a', 1), 6, 7, False) + ('c', 0): (<function concat at ...>, + (<type 'list'>, [('b', 0), ('b', 1)])), + ('c', 1): ('b', 2), + ('c', 2): ('b', 3)} + """ + check_divisions(b) + + if len(b) < 2: + # minimum division is 2 elements, like [0, 0] + raise ValueError('New division must be longer than 2 elements') + + if force: + if a[0] < b[0]: + msg = ('left side of the new division must be equal or smaller ' + 'than old division') + raise ValueError(msg) + if a[-1] > b[-1]: + msg = ('right side of the new division must be equal or larger ' + 'than old division') + raise ValueError(msg) + else: + if a[0] != b[0]: + msg = 'left side of old and new divisions are different' + raise ValueError(msg) + if a[-1] != b[-1]: + msg = 'right side of old and new divisions are different' + raise ValueError(msg) + + def _is_single_last_div(x): + """Whether last division only contains single label""" + return len(x) >= 2 and x[-1] == x[-2] + + c = [a[0]] + d = dict() + low = a[0] + + i, j = 1, 1 # indices for old/new divisions + k = 0 # index for temp divisions + + last_elem = _is_single_last_div(a) + + # process through old division + # left part of new division can be processed in this loop + while (i < len(a) and j < len(b)): + if a[i] < b[j]: + # tuple is something like: + # (methods.boundary_slice, ('from_pandas-#', 0), 3, 4, False)) + d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, a[i], False) + low = a[i] + i += 1 + elif a[i] > b[j]: + d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, b[j], False) + low = b[j] + j += 1 + else: + d[(out1, k)] = (methods.boundary_slice, (name, i - 1), low, b[j], False) + low = b[j] + i += 1 + j += 1 + c.append(low) + k += 1 + + # right part of new division can remain + if a[-1] < b[-1] or b[-1] == b[-2]: + for _j in range(j, len(b)): + # always use right-most of old division + # because it may contain last element + m = len(a) - 2 + d[(out1, k)] = (methods.boundary_slice, (name, m), low, b[_j], False) + low = b[_j] + c.append(low) + k += 1 + else: + # even if new division is processed through, + # right-most element of old division can remain + if last_elem and i < len(a): + d[(out1, k)] = (methods.boundary_slice, (name, i - 1), a[i], a[i], False) + 
k += 1 + c.append(a[-1]) + + # replace last element of tuple with True + d[(out1, k - 1)] = d[(out1, k - 1)][:-1] + (True,) + + i, j = 0, 1 + + last_elem = _is_single_last_div(c) + + while j < len(b): + tmp = [] + while c[i] < b[j]: + tmp.append((out1, i)) + i += 1 + if last_elem and c[i] == b[-1] and (b[-1] != b[-2] or j == len(b) - 1) and i < k: + # append if last split is not included + tmp.append((out1, i)) + i += 1 + if len(tmp) == 0: + # dummy slice to return empty DataFrame or Series, + # which retain original data attributes (columns / name) + d[(out2, j - 1)] = (methods.boundary_slice, (name, 0), a[0], a[0], False) + elif len(tmp) == 1: + d[(out2, j - 1)] = tmp[0] + else: + if not tmp: + raise ValueError('check for duplicate partitions\nold:\n%s\n\n' + 'new:\n%s\n\ncombined:\n%s' + % (pformat(a), pformat(b), pformat(c))) + d[(out2, j - 1)] = (sp.SparseFrame.vstack, tmp) + j += 1 + return d + + +def repartition_npartitions(df, npartitions): + """ Repartition dataframe to a smaller number of partitions """ + new_name = 'repartition-%d-%s' % (npartitions, tokenize(df)) + if df.npartitions == npartitions: + return df + elif df.npartitions > npartitions: + npartitions_ratio = df.npartitions / npartitions + new_partitions_boundaries = [int(new_partition_index * npartitions_ratio) + for new_partition_index in range(npartitions + 1)] + dsk = {} + for new_partition_index in range(npartitions): + value = (sp.SparseFrame.vstack, + [(df._name, old_partition_index) for old_partition_index in + range(new_partitions_boundaries[new_partition_index], + new_partitions_boundaries[new_partition_index + 1])]) + dsk[new_name, new_partition_index] = value + divisions = [df.divisions[new_partition_index] + for new_partition_index in new_partitions_boundaries] + return SparseFrame(merge(df.dask, dsk), new_name, df._meta, divisions) + else: + original_divisions = divisions = pd.Series(df.divisions) + if (df.known_divisions and (np.issubdtype(divisions.dtype, np.datetime64) or + np.issubdtype(divisions.dtype, np.number))): + if np.issubdtype(divisions.dtype, np.datetime64): + divisions = divisions.values.astype('float64') + + if isinstance(divisions, pd.Series): + divisions = divisions.values + + n = len(divisions) + divisions = np.interp(x=np.linspace(0, n, npartitions + 1), + xp=np.linspace(0, n, n), + fp=divisions) + if np.issubdtype(original_divisions.dtype, np.datetime64): + divisions = pd.Series(divisions).astype(original_divisions.dtype).tolist() + elif np.issubdtype(original_divisions.dtype, np.integer): + divisions = divisions.astype(original_divisions.dtype) + + if isinstance(divisions, np.ndarray): + divisions = divisions.tolist() + + divisions = list(divisions) + divisions[0] = df.divisions[0] + divisions[-1] = df.divisions[-1] + + return df.repartition(divisions=divisions) + else: + ratio = npartitions / df.npartitions + split_name = 'split-%s' % tokenize(df, npartitions) + dsk = {} + last = 0 + j = 0 + for i in range(df.npartitions): + new = last + ratio + if i == df.npartitions - 1: + k = npartitions - j + else: + k = int(new - last) + dsk[(split_name, i)] = (split_evenly, (df._name, i), k) + for jj in range(k): + dsk[(new_name, j)] = (getitem, (split_name, i), jj) + j += 1 + last = new + + divisions = [None] * (npartitions + 1) + return SparseFrame(merge(df.dask, dsk), new_name, df._meta, divisions) + + + +def is_broadcastable(dfs, s): + """ + This Series is broadcastable against another dataframe in the sequence + """ + return (isinstance(s, Series) and + s.npartitions == 1 and + 
s.known_divisions and + any(s.divisions == (min(df.columns), max(df.columns)) + for df in dfs if isinstance(df, (SparseFrame, dd.DataFrame)))) + + +def elemwise(op, *args, **kwargs): + """ Elementwise operation for dask.Sparseframes + + Parameters + ---------- + op: function + Function that takes as first parameter the underlying df + args: + Contains Dataframes + kwargs: + Contains meta. + """ + meta = kwargs.pop('meta', no_default) + + _name = funcname(op) + '-' + tokenize(op, kwargs, *args) + + # if pd.Series or pd.DataFrame change to dd.DataFrame + args = _maybe_from_pandas(args) + + # Align DataFrame blocks if divisions are different. + from .multi import _maybe_align_partitions # to avoid cyclical import + args = _maybe_align_partitions(args) + + # extract all dask instances + dasks = [arg for arg in args if isinstance(arg, (SparseFrame, _Frame, + Scalar))] + # extract all dask frames + dfs = [df for df in dasks if isinstance(df, (_Frame, SparseFrame))] + + # We take divisions from the first dask frame + divisions = dfs[0].divisions + + _is_broadcastable = partial(is_broadcastable, dfs) + dfs = list(remove(_is_broadcastable, dfs)) + n = len(divisions) - 1 + + other = [(i, arg) for i, arg in enumerate(args) + if not isinstance(arg, (_Frame, Scalar, SparseFrame))] + + # Get dsks graph tuple keys and adjust the key length of Scalar + keys = [d.__dask_keys__() * n if isinstance(d, Scalar) or _is_broadcastable(d) + else d.__dask_keys__() for d in dasks] + + if other: + dsk = {(_name, i): + (apply, partial_by_order, list(frs), + {'function': op, 'other': other}) + for i, frs in enumerate(zip(*keys))} + else: + dsk = {(_name, i): (op,) + frs for i, frs in enumerate(zip(*keys))} + dsk = merge(dsk, *[d.dask for d in dasks]) + + if meta is no_default: + if len(dfs) >= 2 and len(dasks) != len(dfs): + # should not occur in current funcs + msg = 'elemwise with 2 or more DataFrames and Scalar is not supported' + raise NotImplementedError(msg) + meta = _emulate(op, *args, **kwargs) + + return SparseFrame(dsk, _name, meta, divisions) + + +def map_partitions(func, ddf, meta, name=None, **kwargs): dsk = {} - name = func.__name__ + name = name or func.__name__ token = tokenize(func, meta, **kwargs) name = '{0}-{1}'.format(name, token) @@ -125,10 +677,15 @@ def map_partitions(func, ddf, meta, **kwargs): def apply_and_enforce(func, arg, kwargs, meta): sf = func(arg, **kwargs) - columns = meta.columns if isinstance(sf, sp.SparseFrame): if len(sf.data.data) == 0: + assert meta.empty, \ + "Computed empty result but received non-empty meta" + assert isinstance(meta, sp.SparseFrame), \ + "Computed a SparseFrame but meta is of type {}"\ + .format(type(meta)) return meta + columns = meta.columns if (len(columns) == len(sf.columns) and type(columns) is type(sf.columns) and columns.equals(sf.columns)): @@ -139,4 +696,181 @@ def apply_and_enforce(func, arg, kwargs, meta): return sf +def apply_concat_apply(args, chunk=None, aggregate=None, combine=None, + meta=no_default, token=None, chunk_kwargs=None, + aggregate_kwargs=None, combine_kwargs=None, + split_every=None, split_out=None, split_out_setup=None, + split_out_setup_kwargs=None, **kwargs): + """Apply a function to blocks, then concat, then apply again + + Parameters + ---------- + args : + Positional arguments for the `chunk` function. All `dask.dataframe` + objects should be partitioned and indexed equivalently. 
+ chunk : function [block-per-arg] -> block + Function to operate on each block of data + aggregate : function concatenated-block -> block + Function to operate on the concatenated result of chunk + combine : function concatenated-block -> block, optional + Function to operate on intermediate concatenated results of chunk + in a tree-reduction. If not provided, defaults to aggregate. + token : str, optional + The name to use for the output keys. + chunk_kwargs : dict, optional + Keywords for the chunk function only. + aggregate_kwargs : dict, optional + Keywords for the aggregate function only. + combine_kwargs : dict, optional + Keywords for the combine function only. + split_every : int, optional + Group partitions into groups of this size while performing a + tree-reduction. If set to False, no tree-reduction will be used, + and all intermediates will be concatenated and passed to ``aggregate``. + Default is 8. + split_out : int, optional + Number of output partitions. Split occurs after first chunk reduction. + split_out_setup : callable, optional + If provided, this function is called on each chunk before performing + the hash-split. It should return a pandas object, where each row + (excluding the index) is hashed. If not provided, the chunk is hashed + as is. + split_out_setup_kwargs : dict, optional + Keywords for the `split_out_setup` function only. + kwargs : + All remaining keywords will be passed to ``chunk``, ``aggregate``, and + ``combine``. + + Examples + -------- + >>> def chunk(a_block, b_block): + ... pass + + >>> def agg(df): + ... pass + + >>> apply_concat_apply([a, b], chunk=chunk, aggregate=agg) # doctest: +SKIP + """ + if chunk_kwargs is None: + chunk_kwargs = dict() + if aggregate_kwargs is None: + aggregate_kwargs = dict() + chunk_kwargs.update(kwargs) + aggregate_kwargs.update(kwargs) + + if combine is None: + if combine_kwargs: + raise ValueError("`combine_kwargs` provided with no `combine`") + combine = aggregate + combine_kwargs = aggregate_kwargs + else: + if combine_kwargs is None: + combine_kwargs = dict() + combine_kwargs.update(kwargs) + + if not isinstance(args, (tuple, list)): + args = [args] + + npartitions = set(arg.npartitions for arg in args + if isinstance(arg, SparseFrame)) + if len(npartitions) > 1: + raise ValueError("All arguments must have same number of partitions") + npartitions = npartitions.pop() + + if split_every is None: + split_every = 8 + elif split_every is False: + split_every = npartitions + elif split_every < 2 or not isinstance(split_every, int): + raise ValueError("split_every must be an integer >= 2") + + token_key = tokenize(token or (chunk, aggregate), meta, args, + chunk_kwargs, aggregate_kwargs, combine_kwargs, + split_every, split_out, split_out_setup, + split_out_setup_kwargs) + + # Chunk + a = '{0}-chunk-{1}'.format(token or funcname(chunk), token_key) + if len(args) == 1 and isinstance(args[0], SparseFrame) and not chunk_kwargs: + dsk = {(a, 0, i, 0): (chunk, key) + for i, key in enumerate(args[0].__dask_keys__())} + else: + dsk = {(a, 0, i, 0): (apply, chunk, + [(x._name, i) if isinstance(x, SparseFrame) + else x for x in args], chunk_kwargs) + for i in range(args[0].npartitions)} + + # Split + # this splits the blocks (usually) by their index and + # basically performs a task sort such that the next tree + # aggregation will result in the desired number of partitions + # given by the split_out parameter + if split_out and split_out > 1: + split_prefix = 'split-%s' % token_key + shard_prefix = 'shard-%s' % token_key + 
for i in range(args[0].npartitions): + # For now we assume that split_out_setup selects the index + # as we will only support index groupbys for now. So we can + # use the function provided by dask. + dsk[(split_prefix, i)] = (hash_shard, (a, 0, i, 0), split_out, + split_out_setup, split_out_setup_kwargs) + # At this point we have dictionaries of dataframes. The dictionary keys + # correspond to the hashed index value. Such that rows with the same index + # have the same dictionary key. + # The next line unpacks this dictionaries into pure dataframes again + # now with the correct dask key for their partition. So at this point + # we might have shards of a single row in the next step they are combined again. + for j in range(split_out): + dsk[(shard_prefix, 0, i, j)] = (getitem, (split_prefix, i), j) + a = shard_prefix + else: + split_out = 1 + + # Combine + b = '{0}-combine-{1}'.format(token or funcname(combine), token_key) + k = npartitions + depth = 0 + while k > split_every: + for part_i, inds in enumerate(partition_all(split_every, range(k))): + for j in range(split_out): + conc = (sp.SparseFrame.vstack, [(a, depth, i, j) for i in inds]) + # Finally we apply the combine function on the concatenated + # results. This is usually the same as the aggregate + # function. + if combine_kwargs: + dsk[(b, depth + 1, part_i, j)] = (apply, combine, [conc], combine_kwargs) + else: + dsk[(b, depth + 1, part_i, j)] = (combine, conc) + k = part_i + 1 + a = b + depth += 1 + + # Aggregate + for j in range(split_out): + b = '{0}-agg-{1}'.format(token or funcname(aggregate), token_key) + conc = (sp.SparseFrame.vstack, [(a, depth, i, j) for i in range(k)]) + if aggregate_kwargs: + dsk[(b, j)] = (apply, aggregate, [conc], aggregate_kwargs) + else: + dsk[(b, j)] = (aggregate, conc) + + if meta is no_default: + meta_chunk = _emulate(chunk, *args, **chunk_kwargs) + meta = _emulate(aggregate, sp.SparseFrame.vstack([meta_chunk]), + **aggregate_kwargs) + + for arg in args: + if isinstance(arg, SparseFrame): + dsk.update(arg.dask) + + divisions = [None] * (split_out + 1) + + return SparseFrame(dsk, b, meta, divisions) + + +@get_parallel_type.register(SparseFrame) +def get_parallel_type_distributed(o): + return get_parallel_type(o._meta) + + normalize_token.register((SparseFrame,), lambda a: a._name) diff --git a/sparsity/dask/io.py b/sparsity/dask/io_.py similarity index 53% rename from sparsity/dask/io.py rename to sparsity/dask/io_.py index 3b3a21c..b6c05ae 100644 --- a/sparsity/dask/io.py +++ b/sparsity/dask/io_.py @@ -3,15 +3,41 @@ import numpy as np import pandas as pd +from dask import delayed, base from dask.base import tokenize from dask.dataframe.io.io import sorted_division_locations +from dask.dataframe.utils import make_meta import sparsity as sp -from sparsity.dask.core import SparseFrame, _make_meta +from sparsity.dask.core import SparseFrame +from sparsity.io_ import _write_dict_npz, _open_npz_archive _sorted = sorted +def from_ddf(ddf): + """Convert a dask.dataframe.DataFrame to a sparsity.dask.SparseFrame. 
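A minimal usage sketch (column names hypothetical); every dtype must be numeric, otherwise a ValueError is raised:

>>> import pandas as pd
>>> import dask.dataframe as dd
>>> ddf = dd.from_pandas(pd.DataFrame({'a': [1., 0.], 'b': [0., 2.]}), npartitions=2)
>>> dsf = from_ddf(ddf)  # doctest: +SKIP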
+ + Parameters + ---------- + ddf: dask.dataframe.DataFrame + + Returns + ------- + dsf: sparsity.dask.SparseFrame + a sparse dataframe collection + """ + if not all(np.issubdtype(dtype, np.number) for + dtype in ddf.dtypes.tolist()): + raise ValueError('Cannot create a sparse frame ' + 'of not numerical type') + + tmp = ddf.map_partitions(sp.SparseFrame, meta=object) + dsf = SparseFrame(tmp.dask, tmp._name, ddf._meta, + divisions=tmp.divisions) + return dsf + + + def from_pandas(df, npartitions=None, chunksize=None, name=None): """ Parameters @@ -44,11 +70,11 @@ def from_pandas(df, npartitions=None, chunksize=None, name=None): dsk = dict(((name, i), sp.SparseFrame(df.iloc[start: stop])) for i, (start, stop) in enumerate(zip(locations[:-1], locations[1:]))) - meta = _make_meta(df) + meta = make_meta(df) return SparseFrame(dsk, name, meta, divisions) -def read_npz(path, sorted=False): +def read_npz(path, read_divisions=False, storage_options=None): """ Read SparseFrame from npz archives @@ -57,9 +83,10 @@ path: str path to load files from can contain '*' to reference multiple files - sorted: bool + read_divisions: bool if the files are sorted read the index for each file - to obtain divions + to obtain divisions. If files are not sorted this will + raise an error. Returns ------- @@ -67,20 +94,32 @@ """ dsk = {} name = 'read_npz-{}'.format(tokenize(path)) - _paths = _sorted(list(glob(path))) - archive = np.load(_paths[0]) + loader = None + divisions = None + try: + loader = _open_npz_archive(path.split('*')[0] + 'metadata.npz', + storage_options) + divisions = loader['divisions'] + _paths = loader['partitions'] + except FileNotFoundError: + _paths = _sorted(list(glob(path))) + finally: + if loader: + loader.close() + archive = _open_npz_archive(_paths[0], storage_options) meta_idx, meta_cols = archive['frame_index'], archive['frame_columns'] meta = sp.SparseFrame(np.empty(shape=(0, len(meta_cols))), index=meta_idx[:0], columns=meta_cols) + for i, p in enumerate(_paths): - dsk[name, i] = (sp.SparseFrame.read_npz, p) + dsk[name, i] = (sp.SparseFrame.read_npz, p, storage_options) - if sorted: + if divisions is None and read_divisions: level = 0 if isinstance(meta_idx, pd.MultiIndex) else None divisions = _npz_read_divisions(_paths, level=level) - else: + elif divisions is None: divisions = [None] * (len(_paths) + 1) return SparseFrame(dsk, name, meta, divisions=divisions) @@ -91,11 +130,10 @@ def _npz_read_divisions(paths, level=None): divisions = [] assert len(paths) > 1 for p in paths: - archive = np.load(p) + archive = np.load(p, allow_pickle=True) idx = archive['frame_index'] if level is not None: idx = idx.get_level_values(level) - assert idx.is_monotonic_increasing istart = idx[0] istop = idx[-1] divisions.append(istart) @@ -109,4 +147,39 @@ file=paths[i], div1=divisions[i], div2=divisions[i+1] )) - return divisions \ No newline at end of file + return divisions + + +def write_npz_metadata(writes, divisions, paths, fn, + block_size, storage_options): + data = {} + data['divisions'] = np.asarray(divisions) + data['partitions'] = np.asarray(paths) + + _write_dict_npz(data, fn, block_size, storage_options) + + +def to_npz(sf: SparseFrame, path: str, block_size=None, + storage_options=None, compute=True): + if '*' not in path: + raise ValueError('Path needs to contain "*" wildcard.') + + if '.npz' not in path: + path += '.npz' + + tmpl_func = path.replace('*', '{0:06d}').format + 
metadata_fn = path.split('*')[0] + 'metadata.npz' + paths = list(map(tmpl_func, range(sf.npartitions))) + + write = delayed(sp.SparseFrame.to_npz, pure=False) + writes = [write(part, fn, block_size, storage_options) + for fn, part in zip(paths, sf.to_delayed())] + + write_metadata = delayed(write_npz_metadata, pure=False) + out = write_metadata(writes, sf.divisions, paths, metadata_fn, + block_size, storage_options) + + if compute: + out.compute() + return None + return out diff --git a/sparsity/dask/multi.py b/sparsity/dask/multi.py new file mode 100644 index 0000000..1d267cc --- /dev/null +++ b/sparsity/dask/multi.py @@ -0,0 +1,111 @@ +import toolz +from dask.base import tokenize + +import sparsity.sparse_frame as sp +import pandas as pd +from dask.dataframe.multi import require, required +from sparsity.dask.core import SparseFrame +from functools import partial +from dask.dataframe.core import is_broadcastable, _Frame, aca +from toolz import unique, merge_sorted + + +def join_indexed_sparseframes(lhs, rhs, how='left'): + """ Join two partitioned sparseframes along their index """ + + (lhs, rhs), divisions, parts = align_partitions(lhs, rhs) + divisions, parts = require(divisions, parts, required[how]) + + left_empty = lhs._meta + right_empty = rhs._meta + + name = 'join-indexed-' + tokenize(lhs, rhs, how) + + dsk = dict() + for i, (a, b) in enumerate(parts): + if a is None and how in ('right', 'outer'): + a = left_empty + if b is None and how in ('left', 'outer'): + b = right_empty + + dsk[(name, i)] = (sp.SparseFrame.join, a, b, 1, how) + + meta = sp.SparseFrame.join(lhs._meta_nonempty, rhs._meta_nonempty, how=how) + return SparseFrame(toolz.merge(lhs.dask, rhs.dask, dsk), + name, meta, divisions) + + +def align_partitions(*dfs): + """ Mutually partition and align DataFrame blocks + + This serves as precursor to multi-dataframe operations like join, concat, + or merge. + + Parameters + ---------- + dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar + Sequence of dataframes to be aligned on their index + + Returns + ------- + dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar + These must have consistent divisions with each other + divisions: tuple + Full divisions sequence of the entire result + result: list + A list of lists of keys that show which data exist on which + divisions + """ + _is_broadcastable = partial(is_broadcastable, dfs) + dfs1 = [df for df in dfs + if isinstance(df, (_Frame, SparseFrame)) and + not _is_broadcastable(df)] + if len(dfs) == 0: + raise ValueError("dfs contains no DataFrame and Series") + if not all(df.known_divisions for df in dfs1): + raise ValueError("Not all divisions are known, can't align " + "partitions. Please use `sort_index` or " + "`set_partition` to set the index.") + + divisions = list(unique(merge_sorted(*[df.divisions for df in dfs1]))) + dfs2 = [df.repartition(divisions, force=True) + if isinstance(df, (SparseFrame)) else df for df in dfs] + + result = list() + inds = [0 for df in dfs] + for d in divisions[:-1]: + L = list() + for i, df in enumerate(dfs2): + if isinstance(df, (_Frame, SparseFrame)): + j = inds[i] + divs = df.divisions + if j < len(divs) - 1 and divs[j] == d: + L.append((df._name, inds[i])) + inds[i] += 1 + else: + L.append(None) + else: # Scalar has no divisions + L.append(None) + result.append(L) + return dfs2, tuple(divisions), result + + +def _maybe_align_partitions(args): + """Align DataFrame blocks if divisions are different. 
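The effect of alignment, sketched with two hypothetical frames and hypothetical divisions: the union of all known divisions becomes the common partitioning, so partition i of every aligned frame covers the same index range.

>>> lhs.divisions, rhs.divisions  # doctest: +SKIP
((0, 5, 10), (0, 3, 10))
>>> (lhs2, rhs2), divisions, parts = align_partitions(lhs, rhs)  # doctest: +SKIP
>>> divisions
(0, 3, 5, 10)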
+ + Note that if all divisions are unknown, but have equal npartitions, then + they will be passed through unchanged. This is different than + `align_partitions`, which will fail if divisions aren't all known""" + _is_broadcastable = partial(is_broadcastable, args) + dfs = [df for df in args + if isinstance(df, (_Frame, SparseFrame)) and + not _is_broadcastable(df)] + if not dfs: + return args + + divisions = dfs[0].divisions + if not all(df.divisions == divisions for df in dfs): + dfs2 = iter(align_partitions(*dfs)[0]) + return [a if not isinstance(a, (_Frame, SparseFrame)) + else next(dfs2) for a in args] + return args diff --git a/sparsity/dask/reshape.py b/sparsity/dask/reshape.py index 9c5a5ef..879d431 100644 --- a/sparsity/dask/reshape.py +++ b/sparsity/dask/reshape.py @@ -1,41 +1,101 @@ +import warnings +from collections import OrderedDict + +import numpy as np + import sparsity as sp from sparsity import sparse_one_hot from sparsity.dask import SparseFrame -import pandas as pd -import numpy as np -def one_hot_encode(ddf, column, - categories, index_col): + +def one_hot_encode(ddf, column=None, categories=None, index_col=None, + order=None, prefixes=False, + ignore_cat_order_mismatch=False): """ - Sparse one hot encoding of dask.DataFrame + Sparse one hot encoding of dask.DataFrame. - Convert a dask.DataFrame into a series of SparseFrames. By one hot - encoding a single column + Convert a dask.DataFrame into a series of SparseFrames by one-hot + encoding specified columns. Parameters ---------- ddf: dask.DataFrame e.g. the clickstream - column: str - column name to one hot encode in with SparseFrame - categories: iterable - possible category values - index_col: str, iterable + categories: dict + Maps ``column name`` -> ``iterable of possible category values``. + Can be also ``column name`` -> ``None`` if this column is already + of categorical dtype. + This argument decides which column(s) will be encoded. + See description of `order` and `ignore_cat_order_mismatch`. + index_col: str | iterable which columns to use as index + order: iterable + Specify order in which one-hot encoded columns should be aligned. + + If `order = [col_name1, col_name2]` + and `categories = {col_name1: ['A', 'B'], col_name2: ['C', 'D']}`, + then the resulting SparseFrame will have columns + `['A', 'B', 'C', 'D']`. + + If you don't specify order, then output columns' order depends on + iteration over `categories` dictionary. You can pass `categories` + as an OrderedDict instead of providing `order` explicitly. + prefixes: bool + If False, column names will be the same as categories, + so that new columns will be named like: + [cat11, cat12, cat21, cat22, ...]. + + If True, original column name followed by an underscore will be added + in front of each category name, so that new columns will be named like: + [col1_cat11, col1_cat12, col2_cat21, col2_cat22, ...]. + column: DEPRECATED + Kept only for backward compatibility. + ignore_cat_order_mismatch: bool + If a column being one-hot encoded is of categorical dtype, it has + its categories already predefined, so we don't need to explicitly pass + them in `categories` argument (see this argument's description). + However, if we pass them, they may be different than ones defined in + column.cat.categories. In such a situation, a ValueError will be + raised. However, if only orders of categories are different (but sets + of elements are same), you may specify ignore_cat_order_mismatch=True + to suppress this error. 
In such a situation, column's predefined + categories will be used. Returns ------- - sparse_one_hot: dask.Series + sparse_one_hot: sparsity.dask.SparseFrame """ + if column is not None: + warnings.warn( + '`column` argument of sparsity.dask.reshape.one_hot_encode ' + 'function is deprecated.' + ) + if order is not None: + raise ValueError('`order` and `column` arguments cannot be used ' + 'together.') + categories = {column: categories} + idx_meta = ddf._meta.reset_index().set_index(index_col).index[:0] \ if index_col else ddf._meta.index - meta = sp.SparseFrame(np.array([]), columns=categories, - index=idx_meta) + + if order is not None: + categories = OrderedDict([(column, categories[column]) + for column in order]) + + columns = sparse_one_hot(ddf._meta, + categories=categories, + index_col=index_col, + prefixes=prefixes, + ignore_cat_order_mismatch=ignore_cat_order_mismatch + ).columns + meta = sp.SparseFrame(np.empty(shape=(0, len(columns))), columns=columns, + index=idx_meta) dsf = ddf.map_partitions(sparse_one_hot, - column=column, categories=categories, index_col=index_col, + prefixes=prefixes, + ignore_cat_order_mismatch=ignore_cat_order_mismatch, meta=object) - return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions) \ No newline at end of file + return SparseFrame(dsf.dask, dsf._name, meta, dsf.divisions) diff --git a/sparsity/dask/shuffle.py b/sparsity/dask/shuffle.py new file mode 100644 index 0000000..e42abc2 --- /dev/null +++ b/sparsity/dask/shuffle.py @@ -0,0 +1,243 @@ +import math +from operator import getitem + +from dask import base, delayed +from dask.sizeof import sizeof +from dask.base import tokenize +from dask.dataframe.shuffle import shuffle_group_get, set_partitions_pre, \ + remove_nans, set_index_post_series +from pandas._libs.algos import groupsort_indexer +from toolz import merge +from dask.utils import digit, insert + +import sparsity as sp +import pandas as pd +import numpy as np + +from sparsity.dask import SparseFrame + + +def sort_index(df, npartitions=None, shuffle='tasks', + drop=True, upsample=1.0, divisions=None, + partition_size=128e6, **kwargs): + """ See _Frame.set_index for docstring """ + if npartitions == 'auto': + repartition = True + npartitions = max(100, df.npartitions) + else: + if npartitions is None: + npartitions = df.npartitions + repartition = False + + index2 = index_to_series(df.index) + + if divisions is None: + divisions = index2._repartition_quantiles(npartitions, upsample=upsample) + if repartition: + parts = df.to_delayed() + sizes = [delayed(sizeof)(part) for part in parts] + else: + sizes = [] + iparts = index2.to_delayed() + mins = [ipart.min() for ipart in iparts] + maxes = [ipart.max() for ipart in iparts] + divisions, sizes, mins, maxes = base.compute(divisions, sizes, mins, maxes) + divisions = divisions.tolist() + + empty_dataframe_detected = pd.isnull(divisions).all() + if repartition or empty_dataframe_detected: + total = sum(sizes) + npartitions = max(math.ceil(total / partition_size), 1) + npartitions = min(npartitions, df.npartitions) + n = len(divisions) + try: + divisions = np.interp(x=np.linspace(0, n - 1, npartitions + 1), + xp=np.linspace(0, n - 1, n), + fp=divisions).tolist() + except (TypeError, ValueError): # str type + indexes = np.linspace(0, n - 1, npartitions + 1).astype(int) + divisions = [divisions[i] for i in indexes] + + return set_partition(df, divisions, shuffle=shuffle, drop=drop, + **kwargs) + + +def index_to_series(idx): + return idx.map_partitions(lambda x: x.to_series(), + 
meta=idx._meta.to_series()) + + +def set_partition(sf: SparseFrame, divisions: list, + max_branch=32, drop=True, shuffle=None): + """ Group DataFrame by index + + Sets a new index and partitions data along that index according to + divisions. Divisions are often found by computing approximate quantiles. + The function ``set_index`` will do both of these steps. + + Parameters + ---------- + sf: DataFrame/Series + Data that we want to re-partition + index: string or Series + Column to become the new index + divisions: list + Values to form new divisions between partitions + drop: bool, default True + Whether to delete columns to be used as the new index + shuffle: str (optional) + Either 'disk' for an on-disk shuffle or 'tasks' to use the task + scheduling framework. Use 'disk' if you are on a single machine + and 'tasks' if you are on a distributed cluster. + max_branch: int (optional) + If using the task-based shuffle, the amount of splitting each + partition undergoes. Increase this for fewer copies but more + scheduler overhead. + + See Also + -------- + set_index + shuffle + partd + """ + index = index_to_series(sf.index) + partitions = index.map_partitions(set_partitions_pre, + divisions=divisions, + meta=pd.Series([0])) + sf2 = sf.assign(_partitions=partitions) + + df3 = rearrange_by_index(sf2, max_branch=max_branch, + npartitions=len(divisions) - 1, shuffle=shuffle) + + df4 = df3.map_partitions(sort_index_post_series, + index_name=index.name, + meta=sort_index_post_series(df3._meta, index.name)) + + df4.divisions = divisions + + return df4.map_partitions(sp.SparseFrame.sort_index, df4._meta) + + +def sort_index_post_series(df, index_name): + df2 = df.drop('_partitions', axis=1) + df2.index.name = index_name + return df2 + + +def rearrange_by_index(df, npartitions=None, max_branch=None, + shuffle='tasks'): + if shuffle == 'tasks': + return rearrange_by_index_tasks(df, max_branch, npartitions) + else: + raise NotImplementedError("Unknown shuffle method %s" % shuffle) + + +def rearrange_by_index_tasks(df, max_branch=32, npartitions=None): + """ Order divisions of DataFrame so that all values within index align + + This enacts a task-based shuffle + + See also: + rearrange_by_column_disk + set_partitions_tasks + shuffle_tasks + """ + max_branch = max_branch or 32 + n = df.npartitions + + stages = int(np.math.ceil(math.log(n) / math.log(max_branch))) + if stages > 1: + k = int(math.ceil(n ** (1 / stages))) + else: + k = n + + groups = [] + splits = [] + joins = [] + + inputs = [tuple(digit(i, j, k) for j in range(stages)) + for i in range(k**stages)] + + token = tokenize(df, max_branch) + + start = dict((('shuffle-join-' + token, 0, inp), + (df._name, i) if i < df.npartitions else df._meta) + for i, inp in enumerate(inputs)) + + for stage in range(1, stages + 1): + group = dict((('shuffle-group-' + token, stage, inp), + (shuffle_index, ('shuffle-join-' + token, stage - 1, inp), + stage - 1, k, n)) + for inp in inputs) + + split = dict((('shuffle-split-' + token, stage, i, inp), + (getitem, ('shuffle-group-' + token, stage, inp), i)) + for i in range(k) + for inp in inputs) + + join = dict((('shuffle-join-' + token, stage, inp), + (sp.SparseFrame.vstack, + [('shuffle-split-' + token, stage, inp[stage - 1], + insert(inp, stage - 1, j)) for j in range(k)])) + for inp in inputs) + groups.append(group) + splits.append(split) + joins.append(join) + + end = dict((('shuffle-' + token, i), + ('shuffle-join-' + token, stages, inp)) + for i, inp in enumerate(inputs)) + + dsk = merge(df.dask, start, 
end, *(groups + splits + joins)) + df2 = SparseFrame(dsk, 'shuffle-' + token, df, df.divisions) + + if npartitions is not None and npartitions != df.npartitions: + parts = [i % df.npartitions for i in range(npartitions)] + token = tokenize(df2, npartitions) + dsk = {('repartition-group-' + token, i): (shuffle_group_2, k) + for i, k in enumerate(df2.__dask_keys__())} + for p in range(npartitions): + dsk[('repartition-get-' + token, p)] = \ + (shuffle_group_get, ('repartition-group-' + token, parts[p]), p) + + df3 = SparseFrame(merge(df2.dask, dsk), 'repartition-get-' + token, df2, + [None] * (npartitions + 1)) + else: + df3 = df2 + df3.divisions = (None,) * (df.npartitions + 1) + + return df3 + + +def shuffle_index(sf: sp.SparseFrame, stage, k, npartitions): + ind = sf['_partitions'].todense().astype(np.int) + c = ind._values.reshape(-1) + typ = np.min_scalar_type(npartitions * 2) + c = c.astype(typ) + + npartitions, k, stage = [np.array(x, dtype=np.min_scalar_type(x))[()] + for x in [npartitions, k, stage]] + + c = np.mod(c, npartitions, out=c) + c = np.floor_divide(c, k ** stage, out=c) + c = np.mod(c, k, out=c) + + indexer, locations = groupsort_indexer(c.astype(np.int64), k) + df2 = sf.take(indexer) + locations = locations.cumsum() + parts = [df2.iloc[a:b] for a, b in zip(locations[:-1], locations[1:])] + + return dict(zip(range(k), parts)) + + +def shuffle_group_2(sf: sp.SparseFrame): + if not len(sf): + return {}, sf + ind = sf['_partitions'].todense()._values.astype(np.int64) + n = ind.max() + 1 + indexer, locations = groupsort_indexer(ind.view(np.int64), n) + df2 = sf.take(indexer) + locations = locations.cumsum() + parts = [df2.iloc[a:b] for a, b in zip(locations[:-1], locations[1:])] + result2 = dict(zip(range(n), parts)) + return result2, sf.iloc[:0] diff --git a/sparsity/indexing.py b/sparsity/indexing.py index 90b3e6e..d32466f 100644 --- a/sparsity/indexing.py +++ b/sparsity/indexing.py @@ -1,5 +1,13 @@ from pandas.core.indexing import _LocIndexer, _iLocIndexer +def get_indexers_list(): + + return [ + ('iloc', _CsrILocationIndexer), + ('loc', _CsrLocIndexer), + ] + + class _CsrLocIndexer(_LocIndexer): def __getitem__(self, item): @@ -10,6 +18,7 @@ def _slice(self, slice, axis=0, kind=None): raise NotImplementedError() return self.obj._slice(slice) + class _CsrILocationIndexer(_iLocIndexer): def __getitem__(self, item): @@ -18,4 +27,4 @@ def __getitem__(self, item): def _slice(self, slice, axis=0, kind=None): if axis != 0: raise NotImplementedError() - return self.obj._slice(slice) \ No newline at end of file + return self.obj._slice(slice) diff --git a/sparsity/io.py b/sparsity/io.py deleted file mode 100644 index 35d8fb7..0000000 --- a/sparsity/io.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -from scipy import sparse - -try: - from traildb import TrailDB - from sparsity._traildb import traildb_coo_repr_func -except (ImportError, OSError): - TrailDB = False - -def traildb_to_coo(db, fieldname): - if not TrailDB: - raise ImportError("Could not find traildb") - db_handle = TrailDB(db) - num_events = db_handle.num_events - del db_handle - r_idx = np.zeros(num_events, dtype=np.uint64) - c_idx = np.zeros(num_events, dtype=np.uint64) - uuids = np.zeros((num_events,16), dtype=np.uint8) - timestamps = np.zeros(num_events, dtype=np.uint64) - - cols = traildb_coo_repr_func(db.encode(), fieldname.encode(), r_idx, - c_idx, uuids, timestamps) - return uuids, timestamps, cols,\ - sparse.coo_matrix((np.ones(num_events), (r_idx, c_idx))) - -def to_npz(sf, filename): - data = 
_csr_to_dict(sf.data) - data['frame_index'] = sf.index.values - data['frame_columns'] = sf.columns.values - np.savez(filename, **data) - -def read_npz(filename): - loader = np.load(filename) - csr_mat = _load_csr(loader) - idx = loader['frame_index'] - cols = loader['frame_columns'] - return (csr_mat, idx, cols) - -def _csr_to_dict(array): - return dict(data = array.data ,indices=array.indices, - indptr =array.indptr, shape=array.shape) - -def _load_csr(loader): - return sparse.csr_matrix((loader['data'], - loader['indices'], - loader['indptr']), - shape=loader['shape']) \ No newline at end of file diff --git a/sparsity/io_.py b/sparsity/io_.py new file mode 100644 index 0000000..1e900bd --- /dev/null +++ b/sparsity/io_.py @@ -0,0 +1,171 @@ +from io import BytesIO +from pathlib import PurePath, Path +from urllib.parse import urlparse + +import numpy as np +import pandas as pd +from scipy import sparse + +_filesystems = {} + +try: + from dask.bytes.local import LocalFileSystem +except ImportError: + + class LocalFileSystem: + open = open + +_filesystems[''] = LocalFileSystem +_filesystems['file'] = LocalFileSystem + +try: + import s3fs + _filesystems['s3'] = s3fs.S3FileSystem +except ImportError: + pass + +try: + import gcsfs + _filesystems['gs'] = gcsfs.GCSFileSystem + _filesystems['gcs'] = gcsfs.GCSFileSystem +except ImportError: + pass + + +def to_npz(sf, filename, block_size=None, storage_options=None): + """Write to npz file format. + + Parameters + ---------- + sf: sp.SparseFrame + sparse frame to store. + filename: str + path to write to. + block_size: int + block size in bytes when sending data to external filesystem. + Default is 100MB. + storage_options: dict + (optional) storage options for external filesystems. + + Returns + ------- + sf: SparseFrame + """ + filename = path2str(filename) + data = _csr_to_dict(sf.data) + data['metadata'] = \ + {'multiindex': True if isinstance(sf.index, pd.MultiIndex) else False} + data['frame_index'] = sf.index.values + data['frame_columns'] = sf.columns.values + if not filename.endswith('.npz'): + filename += '.npz' + + _write_dict_npz(data, filename, block_size, storage_options) + + +def _write_dict_npz(data, filename, block_size, storage_options): + filename = path2str(filename) + protocol = urlparse(filename).scheme or 'file' + if protocol == 'file': + Path(filename).parent.mkdir(parents=True, exist_ok=True) + with open(filename, 'wb') as fp: + np.savez(fp, **data) + else: + if block_size is None: + block_size = 2 ** 20 * 100 # 100 MB + buffer = BytesIO() + np.savez(buffer, **data) + buffer.seek(0) + _save_remote(buffer, filename, block_size, storage_options) + + +def _save_remote(buffer, filename, block_size=None, storage_options=None): + if storage_options is None: + storage_options = {} + filename = path2str(filename) + protocol = urlparse(filename).scheme + fs = _filesystems[protocol](**storage_options) + with fs.open(filename, 'wb', block_size) as remote_f: + while True: + data = buffer.read(block_size) + if len(data) == 0: + break + remote_f.write(data) + + +def read_npz(filename, storage_options=None): + """Read from a npz file. + + Parameters + ---------- + filename: str + path to file. + storage_options: dict + (optional) storage options for external filesystems. 
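For instance, writing a SparseFrame ``sf`` to S3 and reading it back (bucket name and credentials hypothetical, s3fs required):

>>> to_npz(sf, 's3://my-bucket/frame.npz', storage_options={'anon': False})  # doctest: +SKIP
>>> csr, idx, cols = read_npz('s3://my-bucket/frame.npz', storage_options={'anon': False})  # doctest: +SKIP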
+ + Returns + ------- + sf: sp.SparseFrame + """ + loader = _open_npz_archive(filename, storage_options) + try: + csr_mat = _load_csr(loader) + idx = _load_idx_from_npz(loader) + cols = loader['frame_columns'] + finally: + loader.close() + return csr_mat, idx, cols + + +def _open_npz_archive(filename, storage_options=None): + if storage_options is None: + storage_options = {} + filename = path2str(filename) + protocol = urlparse(filename).scheme or 'file' + open_f = _filesystems[protocol](**storage_options).open + fp = open_f(filename, 'rb') + loader = np.load(fp, allow_pickle=True) + return loader + + +def _csr_to_dict(array): + return dict(data = array.data ,indices=array.indices, + indptr =array.indptr, shape=array.shape) + + +def _load_csr(loader): + return sparse.csr_matrix((loader['data'], + loader['indices'], + loader['indptr']), + shape=loader['shape']) + + +def _load_idx_from_npz(loader): + idx = loader['frame_index'] + try: + if loader['metadata'][()]['multiindex']: + idx = pd.MultiIndex.from_tuples(idx) + except KeyError: + if all(map(lambda x: isinstance(x, tuple), idx)): + idx = pd.MultiIndex.from_tuples(idx) + return idx + + +def _just_read_array(path): + path = path2str(path) + if path.endswith('hdf') or path.endswith('hdf5'): + return pd.read_hdf(path, '/df').values + elif path.endswith('csv'): + return pd.read_csv(path).values + elif path.endswith('pickle'): + return pd.read_pickle(path).values + + +def path2str(arg): + """Convert arg into its string representation. + + This is only done if arg is subclass of PurePath + """ + if issubclass(type(arg), PurePath): + return str(arg) + return arg diff --git a/sparsity/sparse_frame.py b/sparsity/sparse_frame.py index d9479e9..3819396 100644 --- a/sparsity/sparse_frame.py +++ b/sparsity/sparse_frame.py @@ -1,50 +1,76 @@ # coding=utf-8 +import functools import traceback -from functools import partial +import warnings +from collections import OrderedDict +from functools import partial, reduce -import pandas as pd import numpy as np -import uuid -from functools import reduce - -from pandas.core.common import _default_index +import pandas as pd from pandas.api import types -from pandas.indexes.base import _ensure_index -from sparsity.io import to_npz, read_npz -from scipy import sparse try: - from sparsity.io import traildb_to_coo - trail_db = True -except: - trail_db = False -from sparsity.indexing import _CsrILocationIndexer, _CsrLocIndexer + # pandas>=0.24.0 + from pandas.core.indexes.base import ensure_index +except ImportError: + try: + # pandas>0.21.0 + from pandas.core.indexes.base import _ensure_index as ensure_index + except ImportError: + # pandas==0.21.* + from pandas.indexes.base import _ensure_index as ensure_index +from sparsity.io_ import to_npz, read_npz, _just_read_array +from scipy import sparse -def _is_empty(data): - try: - if data.nnz == 0: - return True - else: - return False - except: - pass +from sparsity.indexing import get_indexers_list - if len(data) == 0: - return True - elif isinstance(data, list) and sum(map(len, list)) == 0: + +def _is_empty(data): + if any(map(lambda x: x == 0, data.shape)): return True return False + +def _append_zero_row(csr): + return sparse.vstack( + [csr, + sparse.coo_matrix((1, csr.shape[1])).tocsr()] + ) + + +def _default_index(n): + from pandas.core.index import RangeIndex + return RangeIndex(0, n, name=None) + + class SparseFrame(object): - """ - Simple sparse table based on scipy.sparse.csr_matrix + """ Two dimensional, size-mutable, homogenous tabular data structure with + 
labeled axes (rows and columns). It adds pandas indexing abilities to a + compressed row sparse frame based on scipy.sparse.csr_matrix. This makes + indexing along the first axis extremely efficient and cheap. Indexing along + the second axis should be avoided if possible though. + + For a distributed implementation see sparsity.dask.SparseFrame. """ - __slots__ = ["_index", "_columns", "_data", "shape", - 'ndim', 'iloc', 'loc', 'empty'] + _AXIS_ORDERS = [0, 1] def __init__(self, data, index=None, columns=None, **kwargs): + """Init SparseFrame + + Parameters + ---------- + data: sparse.csr_matrix | np.ndarray | pandas.DataFrame + Data to initialize matrix with. Can be one of above types, or + anything accepted by sparse.csr_matrix along with the correct + kwargs. + index: pd.Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + input data has no indexing information and no index provided. + columns : pd.Index or array-like + Column labels to use for resulting frame. Defaults like in index. + """ if len(data.shape) > 2: raise ValueError("Only two dimensional data supported") @@ -52,44 +78,80 @@ def __init__(self, data, index=None, columns=None, **kwargs): data = data.to_frame() elif len(data.shape) == 1: - data = data.reshape(-1,1) + data = data.reshape(-1, 1) self.empty = False N, K = data.shape if index is None: self._index = _default_index(N) + elif len(index) != N and data.size: + if columns is not None: + implied_axis_1 = len(columns) + else: + implied_axis_1 = data.shape[1] + raise ValueError('Shape of passed values is {},' + 'indices imply {}' + .format(data.shape, (len(index), implied_axis_1))) else: - # assert len(index) == N - self._index = _ensure_index(index) + self._index = ensure_index(index) if columns is None: self._columns = _default_index(K) + elif len(columns) != K and data.size: + if index is not None: + implied_axis_0 = len(index) + else: + implied_axis_0 = data.shape[0] + raise ValueError('Shape of passed values is {},' + 'indices imply {}' + .format(data.shape, (implied_axis_0, len(columns)))) else: - # assert len(columns) == K - self._columns = _ensure_index(columns) + self._columns = ensure_index(columns) if not sparse.isspmatrix_csr(data): try: - self._init_values(data, kwargs) + self._init_values(data, + init_index=index is None, + init_columns=columns is None, + **kwargs) except TypeError: raise TypeError(traceback.format_exc() + "\nThe error described above occurred while " "converting data to sparse matrix.") else: + self.empty = True if _is_empty(data) else False self._init_csr(data) - # register indexers self.ndim = 2 - self.iloc = _CsrILocationIndexer(self, 'iloc') - self.loc = _CsrLocIndexer(self, 'loc') - def _init_values(self, data, kwargs): + @classmethod + def _create_indexer(cls, name, indexer): + """Create an indexer like _name in the class.""" + if getattr(cls, name, None) is None: + _v = tuple(map(int, pd.__version__.split('.'))) + if _v >= (0, 23, 0): + _indexer = functools.partial(indexer, name) + else: + _indexer = functools.partial(indexer, name=name) + setattr(cls, name, property(_indexer, doc=indexer.__doc__)) + + def _init_values(self, data, init_index=True, init_columns=True, **kwargs): if isinstance(data, pd.DataFrame): self.empty = data.empty self._init_csr(sparse.csr_matrix(data.values)) - self._index = _ensure_index(data.index) - self._columns = _ensure_index(data.columns) + if init_index: + self._index = ensure_index(data.index) + else: + warnings.warn("Passed index explicitly while initializing " + 
"from pd.DataFrame. Original DataFrame's index " + "will be ignored.", SyntaxWarning) + if init_columns: + self._columns = ensure_index(data.columns) + else: + warnings.warn("Passed columns explicitly while initializing " + "from pd.DataFrame. Original DataFrame's columns" + " will be ignored.", SyntaxWarning) elif _is_empty(data): self.empty = True self._data = sparse.csr_matrix((len(self.index), @@ -100,25 +162,53 @@ def _init_values(self, data, kwargs): self._init_csr(sparse_data) def toarray(self): + """Return dense np.array representation.""" return self.todense(pandas=False) def todense(self, pandas=True): + """Return dense representation. + + Parameters + ---------- + pandas: bool + If true returns a pandas DataFrame (default), + else a numpy array is returned. + + Returns + ------- + dense: pd.DataFrame | np.ndarray + dense representation + """ if not self.empty: dense = np.asarray(self.data.toarray()) else: - dense = np.empty(shape=(0, len(self.columns))) + dense = np.empty(shape=(0, len(self.columns)), + dtype=self.data.dtype) if self.shape[0] == 1 or self.shape[1] == 1: dense = dense.reshape(-1) - if pandas == True: + + if pandas: if self.empty: - dense = pd.DataFrame([], columns=self.columns, + dense = pd.DataFrame(np.empty(shape=self.shape, + dtype=self.data.dtype), + columns=self.columns, index=self._index[:0]) - elif len(dense.shape) == 1: + if self.data.shape[1] == 1: # 1 empty column => empty Series + dense = dense.iloc[:, 0] + elif len(dense.shape) == 1 and \ + self.data.shape[1] == 1: # 1 column => Series dense = pd.Series(dense, index=self.index, name=self.columns[0]) - else: - dense = pd.DataFrame(dense, index=self.index, + elif len(dense.shape) == 1 and \ + self.data.shape[1] > 1: # 1 row => DataFrame + dense = pd.DataFrame(dense.reshape(1, -1), index=self.index, + columns=self.columns) + else: # 2+ cols and 2+ rows + # need to copy, as broadcast_to returns read_only array + idx = np.broadcast_to(self.index, dense.shape[0])\ + .copy() + dense = pd.DataFrame(dense, index=idx, columns=self.columns) return dense @@ -126,13 +216,13 @@ def _init_csr(self, csr): """Keep a zero row at the end of the csr matrix for aligns.""" self.shape = csr.shape if not self.empty: - self._data = sparse.vstack( - [csr, - sparse.coo_matrix((1,csr.shape[1])).tocsr() - ]) + self._data = _append_zero_row(csr) else: self._data = csr + def _get_axis_number(self, axis): + return axis + def _get_axis(self, axis): """Rudimentary indexing support.""" if axis == 0: @@ -141,98 +231,245 @@ def _get_axis(self, axis): return self._columns def sum(self, *args, **kwargs): + """Sum elements.""" return self.data.sum(*args, **kwargs) def mean(self, *args, **kwargs): + """Calculate mean(s).""" return self.data.mean(*args, **kwargs) - def std(self, *args, **kwargs): - return self.data.std(*args, **kwargs) - def max(self, *args, **kwargs): + """Find maximum element(s).""" return self.data.max(*args, **kwargs) def min(self, *args, **kwargs): + """Find minimum element(s)""" return self.data.min(*args, **kwargs) - def copy(self, *args, **kwargs): - return SparseFrame(self.data.copy(*args, **kwargs), - self.index.copy(*args, **kwargs), - self.columns.copy(*args, **kwargs)) + def copy(self, *args, deep=True, **kwargs): + """Copy frame + + Parameters + ---------- + args: + are passed to indizes and values copy methods + deep: bool + if true (default) data will be copied as well. 
+ kwargs: + are passed to indizes and values copy methods + + Returns + ------- + copy: SparseFrame + """ + if deep: + return SparseFrame(self.data.copy(*args, **kwargs), + self.index.copy(*args, **kwargs), + self.columns.copy(*args, **kwargs)) + else: + return SparseFrame(self.data, + self.index.copy(*args, **kwargs), + self.columns.copy(*args, **kwargs)) + + def multiply(self, other, axis='columns'): + """ + Multiply SparseFrame row-wise or column-wise. + + Parameters + ---------- + other: array-like + Vector of numbers to multiply columns/rows by. + axis: int | str + - 1 or 'columns' to multiply column-wise (default) + - 0 or 'index' to multiply row-wise + """ + try: + other = other.toarray() + except AttributeError: + pass + + if axis in [0, 'index']: + other = np.asarray(other).reshape(-1, 1) + elif axis in [1, 'columns']: + other = np.asarray(other).reshape(1, -1) + else: + raise ValueError("Axis should be one of 0, 1, 'index', 'columns'.") + + data = self.data.multiply(other) + assert data.shape == self.data.shape, \ + "Data shapes mismatch: {}, {}".format(data.shape, self.data.shape) + return SparseFrame(data, self.index, self.columns) + def nnz(self): + """Get the count of explicitly stored values (nonzeros).""" return self.data.nnz def take(self, idx, axis=0, **kwargs): - """Return data at integer locations.""" + """Return data at integer locations. + + Parameters + ---------- + idx: array-like | int + array of integer locations + axis: + which axis to index + kwargs: + not used + + Returns + ------- + indexed: SparseFrame + reindexed sparse frame + """ if axis == 0: - return SparseFrame(self.data[idx,:], + return SparseFrame(self.data[idx, :], index=self.index[idx], columns=self.columns) elif axis == 1: - return SparseFrame(self.data[:,idx], + return SparseFrame(self.data[:, idx], index=self.index, columns=self.columns[idx]) - def _xs(self, key, *args, **kwargs): + def _take(self, *args, **kwargs): + """ + This function is to mimic pandas api (0.21.0) + and support indexing. + + See https://github.com/pandas-dev/pandas/commit/458c1dc81b7e6f90180b06179ac91d9ed868cb05 + """ + return self.take(*args, **kwargs) + + def _xs(self, key, *args, axis=0, **kwargs): """Used for label based indexing.""" - loc = self.index.get_loc(key) - return SparseFrame(self.data[loc], index=[key], columns=self.columns) + if axis == 0: + loc = self.index.get_loc(key) + new_data = self.data[loc] + return SparseFrame(new_data, + index=[key] * new_data.shape[0], + columns=self.columns) + else: + loc = self.columns.get_loc(key) + new_data = self.data[:, loc] + return SparseFrame(new_data, + columns=[key] * new_data.shape[1], + index=self.index) + @property def index(self): + """ Return index labels + + Returns + ------- + index: pd.Index + """ return self._index @property def columns(self): + """ Return column labels + + Returns + ------- + index: pd.Index + """ return self._columns @property def data(self): + """ Return data matrix + + Returns + ------- + data: scipy.spar.csr_matrix + """ if self.empty: return self._data - return self._data[:-1,:] + return self._data[:-1, :] - # backwards comptability - def groupby(self, by=None, level=0): - return self.groupby_sum(by, level) + def groupby_agg(self, by=None, level=None, agg_func=None): + """ Aggregate data using callable. - def groupby_sum(self, by=None, level=0): + The `by` and `level` arguments are mutually exclusive. 
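+
+        A rough sketch of callable-based aggregation; the column name
+        'user_id' is hypothetical and is assumed to exist in the frame::
+
+            means = sf.groupby_agg(by='user_id',
+                                   agg_func=lambda grp: grp.mean(axis=0))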
+ + Parameters + ---------- + by: array-like, string + grouping array or grouping column name + level: int + which level from index to use if multiindex + agg_func: callable + Function which will be applied to groups. Must accept + a SparseFrame and needs to return a vector of shape (1, n_cols). + + Returns + ------- + sf: SparseFrame + aggregated result """ - Sparse groupby sum aggregation. + by, cols = self._get_groupby_col(by, level) + groups = pd.Index(np.arange(self.shape[0])).groupby(by) + res = sparse.csr_matrix((len(groups), self.shape[1])) + new_idx = [] + for i, (name, indices) in enumerate(groups.items()): + new_idx.append(name) + res[i] = agg_func(self.data[indices.values, :]) + res = SparseFrame(res, index=new_idx, columns=self.columns) + return res[cols] + + def groupby_sum(self, by=None, level=0): + """Optimized sparse groupby sum aggregation. Simple operation using sparse matrix multiplication. - Expects result to be sparse aswell. + Expects result to be sparse as well. + + The by and level arguments are mutually exclusive. Parameters ---------- - by: np.ndarray - (optional) alternative index. + by: np.ndarray (optional) + Alternative index. level: int Level of (multi-)index to group on. Returns ------- - df: sparcity.SparseFrame + df: sparsity.SparseFrame Grouped by and summed SparseFrame. """ - if by is not None and by is not "index": - assert len(by) == self.data.shape[0] - by = np.array(by) - else: - if level and isinstance(self._index, pd.MultiIndex): - by = self.index.get_level_values(level).values - elif level: - raise ValueError("Connot use level in a non MultiIndex Frame") - else: - by = self.index.values + by, cols = self._get_groupby_col(by, level) group_idx = by.argsort() gm = _create_group_matrix(by[group_idx]) grouped_data = self._data[group_idx, :].T.dot(gm).T - return SparseFrame(grouped_data, index=np.unique(by), columns=self._columns) + res = SparseFrame(grouped_data, index=np.unique(by), + columns=self._columns) + return res[cols] + + def _get_groupby_col(self, by, level): + if by is None and level is None: + raise ValueError("You have to supply one of 'by' and 'level'.") + other_cols = self._columns.tolist() + if by is not None: + try: + if by in self._columns: + other_cols.remove(by) + by = self[by].toarray() + except TypeError: + assert len(by) == self.data.shape[0] + by = np.array(by) + else: + if level and isinstance(self._index, pd.MultiIndex): + by = self.index.get_level_values(level).values + elif level > 0: + raise ValueError( + "Cannot use level > 0 in a non-MultiIndex Frame.") + else: # level == 0 + by = np.asarray(self._index) + return by, other_cols def join(self, other, axis=1, how='outer', level=None): """ - Join two tables along their indices + Join two tables along their indices. 
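+
+        A minimal sketch, assuming both frames share the same index; the
+        column labels below are made up::
+
+            left = SparseFrame(np.identity(3), columns=['a', 'b', 'c'])
+            right = SparseFrame(np.identity(3), columns=['d', 'e', 'f'])
+            wide = left.join(right, axis=1, how='outer')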
Parameters ---------- @@ -243,53 +480,84 @@ def join(self, other, axis=1, how='outer', level=None): how: str one of 'inner', 'outer', 'left', 'right' level: int - if Multiindex join using this level - + if axis is MultiIndex, join using this level Returns ------- joined: sparsity.SparseFrame """ - if isinstance(self._index, pd.MultiIndex)\ - or isinstance(other._index, pd.MultiIndex): - raise NotImplementedError() + if isinstance(self._index, pd.MultiIndex) \ + or isinstance(other._index, pd.MultiIndex): + raise NotImplementedError('MultiIndex not supported.') if not isinstance(other, SparseFrame): other = SparseFrame(other) - if axis not in set([0, 1]): - raise ValueError("axis mut be either 0 or 1") + if axis not in {0, 1}: + raise ValueError("Axis mut be either 0 or 1.") if axis == 0: - if np.all(other._columns.values == self._columns.values): + if np.array_equal(other._columns.values, self._columns.values): # take short path if join axes are identical data = sparse.vstack([self.data, other.data]) index = np.hstack([self.index, other.index]) res = SparseFrame(data, index=index, columns=self._columns) else: - raise NotImplementedError( - "Joining along axis 0 fails when column names differ." - "This is probably caused by adding all-zeros row.") - data, new_index = _matrix_join(self._data.T.tocsr(), other._data.T.tocsr(), - self._columns, other._columns, - how=how) + data, new_index = _matrix_join( + _append_zero_row(self.data.T.tocsr()), + _append_zero_row(other.data.T.tocsr()), + self._columns, + other._columns, + how=how, + ) res = SparseFrame(data.T.tocsr(), index=np.concatenate([self.index, other.index]), columns=new_index) elif axis == 1: - if np.all(self.index.values == other.index.values): + if np.array_equal(self.index.values, other.index.values): # take short path if join axes are identical data = sparse.hstack([self.data, other.data]) columns = np.hstack([self._columns, other._columns]) res = SparseFrame(data, index=self.index, columns=columns) else: - data, new_index = _matrix_join(self._data, other._data, - self.index, other.index, - how=how) + if other.empty: + other_data = sparse.csr_matrix((1, other.shape[1]), + dtype=other.data.dtype) + else: + other_data = other._data + + if self.empty: + self_data = sparse.csr_matrix((1, self.shape[1]), + dtype=self.data.dtype) + else: + self_data = self._data + + data, new_index = _matrix_join(self_data, other_data, + self.index, other.index, + how=how) res = SparseFrame(data, index=new_index, - columns=np.concatenate([self._columns, other._columns])) + columns=np.concatenate([self._columns, + other._columns])) + else: + raise ValueError('Axis must be either 0 or 1.') + return res + def __len__(self): + return self.shape[0] + def rename(self, columns, inplace=False): """ Rename columns by applying a callable to every column name. + + Parameters + ---------- + columns: callable + a callable that will accepts a column element and returns the + new column label. + inplace: bool + if true the operation will be executed inplace + + Returns + ------- + renamed: SparseFrame | None """ new_cols = self.columns.map(columns) if not inplace: @@ -301,11 +569,12 @@ def rename(self, columns, inplace=False): @property def values(self): + """CSR Matrix represenation of frame""" return self.data def sort_index(self): """ - Sort table along index + Sort table along index. 
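+
+        A short sketch::
+
+            sf = SparseFrame(np.identity(3), index=[2, 0, 1])
+            sf = sf.sort_index()  # rows reordered to index 0, 1, 2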
Returns ------- @@ -314,20 +583,52 @@ def sort_index(self): passive_sort_idx = np.argsort(self._index) data = self._data[passive_sort_idx] index = self._index[passive_sort_idx] - return SparseFrame(data, index=index) + return SparseFrame(data, index=index, columns=self.columns) - def add(self, other, how='outer'): + def fillna(self, value): + """Replace NaN values in explicitly stored data with `value`. + + Parameters + ---------- + value: scalar + Value to use to fill holes. value must be of same dtype as + the underlying SparseFrame's data. If 0 is chosen + new matrix will have these values eliminated. + + Returns + ------- + filled: SparseFrame + """ + _data = self._data.copy() + _data.data[np.isnan(self._data.data)] = value + if value == 0: + _data.eliminate_zeros() + return SparseFrame(data=_data[:-1, :], + index=self.index, columns=self.columns) + + def add(self, other, how='outer', fill_value=0, **kwargs): """ Aligned addition. Adds two tables by aligning them first. Parameters ---------- - other: sparsity.SparseFrame + other: sparsity.SparseFrame + Another SparseFrame. + how: str + How to join frames along their indexes. Default is 'outer' which + makes the result contain labels from both frames. + fill_value: float + Fill value if other frame is not exactly the same shape. + For sparse data the only sensible fill value is 0. Passing + any other value will result in a ValueError. Returns ------- - added: sparsity.SparseFrame + added: sparsity.SparseFrame """ + if fill_value != 0: + raise ValueError("Only 0 is accepted as fill_value " + "for sparse data.") assert np.all(self._columns == other.columns) data, new_idx = _aligned_csr_elop(self._data, other._data, self.index, other.index, @@ -356,44 +657,67 @@ def __repr__(self): data = data.toarray() else: cols = self._columns - data = self.data[:nrows,:].toarray() + data = self.data[:nrows, :].toarray() + + df = pd.DataFrame(data, columns=cols, index=self._index[:nrows]) + df_str = df.__repr__().splitlines() + if df_str[-2] == '': + df_str = df_str[:-2] - df = pd.DataFrame(data, - columns=cols, - index=self._index[:nrows] - ) - df_str = df.__repr__().splitlines()[:-2] sparse_str = "[{nrows}x{ncols} SparseFrame of type '<class " \ "'{dtype}'>' \n with {nnz} stored elements " \ "in Compressed Sparse Row format]".format( - nrows=self.shape[0], - ncols=self.shape[1], - dtype=self.data.dtype, - nnz=self.data.nnz - ) - repr = "{data}\n{sparse}"\ - .format(data='\n'.join(df_str), - sparse=sparse_str) + nrows=self.shape[0], + ncols=self.shape[1], + dtype=self.data.dtype, + nnz=self.data.nnz + ) + repr = "{data}\n{sparse}" \ + .format(data='\n'.join(df_str), sparse=sparse_str) return repr def __array__(self): return self.toarray() def head(self, n=1): - """Display head of the sparsed frame.""" + """Return rows from the top of the table. + + Parameters + ---------- + n: int + how many rows to return, default is 1 + + Returns + ------- + head: SparseFrame + """ n = min(n, len(self._index)) - return pd.SparseDataFrame(self.data[:n,:].todense(), + return pd.SparseDataFrame(self.data[:n, :].todense(), index=self.index[:n], columns=self.columns) def _slice(self, sliceobj): - return SparseFrame(self.data[sliceobj,:], + return SparseFrame(self.data[sliceobj, :], index=self.index[sliceobj], columns=self.columns) @classmethod def concat(cls, tables, axis=0): - """Concat a collection of SparseFrames along given axis.""" + """Concat a collection of SparseFrames along given axis. + + Uses join internally so it might not be very efficient. 
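+
+        A rough sketch; sf1, sf2 and sf3 are assumed to be SparseFrames
+        with matching columns::
+
+            stacked = SparseFrame.concat([sf1, sf2, sf3], axis=0)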
+ + Parameters + ---------- + tables: list + a list of SparseFrames. + axis: + which axis to concatenate along. + + Returns + ------- + + """ func = partial(SparseFrame.join, axis=axis) return reduce(func, tables) @@ -407,20 +731,19 @@ def _ixs(self, key, axis=0): index=new_idx, columns=self.columns) - @classmethod - def read_traildb(cls, file, field, ts_unit='s'): - if not trail_db: - raise ImportError("Traildb could not be imported") - uuids, timestamps, cols, coo = traildb_to_coo(file, field) - uuids = np.asarray([uuid.UUID(bytes=x.tobytes()) for x in - uuids]) - index = pd.MultiIndex.from_arrays \ - ([pd.CategoricalIndex(uuids), - pd.to_datetime(timestamps, unit=ts_unit,)], - names=('uuid', 'timestamp')) - return cls(coo.tocsr(), index=index, columns=cols) - def assign(self, **kwargs): + """Assign new columns. + + Parameters + ---------- + kwargs: dict + Mapping from column name to values. Values must be of correct shape + to be inserted successfully. + + Returns + ------- + assigned: SparseFrame + """ sf = self for key, value in kwargs.items(): sf = sf._single_assign(key, value) @@ -451,20 +774,64 @@ def _single_assign(self, key, value): new_cols, new_data = self._add_col(key, value) return SparseFrame(new_data, index=self.index, columns=new_cols) + def drop(self, labels, axis=1): + """Drop label(s) from given axis. + + Currently works only for columns. + + Parameters + ---------- + labels: array-like + labels to drop from the columns + axis: int + only columns are supported atm. + + Returns + ------- + df: SparseFrame + """ + if not isinstance(labels, (list, tuple, set)): + labels = [labels] + if axis == 1: + mask = np.logical_not(self.columns.isin(labels)) + sf = self.loc[:, self.columns[mask].tolist()] + else: + raise NotImplementedError + return sf + def drop_duplicate_idx(self, **kwargs): - """Drop rows with duplicated index.""" + """Drop rows with duplicated index. + + Parameters + ---------- + kwargs: + kwds are passed to pd.Index.duplicated + + Returns + ------- + dropped: SparseFrame + """ mask = ~self.index.duplicated(**kwargs) return SparseFrame(self.data[mask], index=self.index.values[mask], columns=self.columns) def __getitem__(self, item): - if not isinstance(item, (tuple, list)): + if item is None: + raise ValueError('Cannot label index with a null key.') + if not isinstance(item, (pd.Series, np.ndarray, pd.Index, list, + tuple)): + # TODO: tuple probably should be a separate case as in Pandas + # where it is used with Multiindex item = [item] - idx = [] - for key in item: - idx.append(self.columns.get_loc(key)) - return SparseFrame(self.data[:,idx], index=self.index, - columns=[item]) + if len(item) > 0: + indexer = self.loc._convert_to_indexer( + item, axis=1 + ) + return self._take(indexer, axis=1) + else: + data = np.empty(shape=(self.shape[0], 0)) + return SparseFrame(data, index=self.index, + columns=self.columns[[]]) def dropna(self): """Drop nans from index.""" @@ -474,7 +841,24 @@ def dropna(self): return SparseFrame(new_data, index=new_index, columns=self.columns) def set_index(self, column=None, idx=None, level=None, inplace=False): - """Set index from array, column or existing multi-index level.""" + """Set index from array, column or existing multi-index level. + + Parameters + ---------- + column: str + set index from existing column in data. + idx: pd.Index, np.array + Set the index directly with a pandas index object or array + level: int + set index from a multiindex level. useful for groupbys. 
+ inplace: bool + perform data transformation inplace + + Returns + ------- + sf: sp.SparseFrame | None + the transformed sparse frame or None if inplace was True + """ if column is None and idx is None and level is None: raise ValueError("Either column, idx or level should not be None") elif idx is not None: @@ -484,10 +868,10 @@ def set_index(self, column=None, idx=None, level=None, inplace=False): isinstance(self._index, pd.MultiIndex): new_idx = self.index.get_level_values(level) elif column is not None: - new_idx = np.asarray(self[column].data.todense()).reshape(-1) + new_idx = np.asarray(self.loc[:, column].data.todense()).reshape(-1) if inplace: - self._index = _ensure_index(new_idx) + self._index = ensure_index(new_idx) else: return SparseFrame(self.data, index=new_idx, @@ -507,25 +891,199 @@ def vstack(cls, frames): columns=frames[0].columns) @classmethod - def read_npz(cls, filename): - """"Read from numpy npz format.""" - return cls(*read_npz(filename)) + def read_npz(cls, filename, storage_options=None): + """Read from numpy npz format. + + Reads the sparse frame from a npz archive. + Supports reading npz archives from remote locations + with GCSFS and S3FS. + + Parameters + ---------- + filename: str + path or uri to location + storage_options: dict + further options for the underlying filesystem + + Returns + ------- + sf: SparseFrame + """ + return cls(*read_npz(filename, storage_options)) + + @property + def axes(self): + return [self.index, self.columns] + + def _get_axis_name(self, axis): + try: + return ['index', 'columns'][axis] + except IndexError: + raise ValueError('No axis named {} for {}' + .format(axis, self.__class__)) + + def _reindex_with_indexers(self, reindexers, **kwargs): + """allow_dups indicates an internal call here """ + + # reindex doing multiple operations on different axes if indicated + new_data = self.copy() + for axis in sorted(reindexers.keys()): + index, indexer = reindexers[axis] + + if index is None: + continue + + if axis == 0: + new_mat = new_data.data[indexer, :] + new_data = SparseFrame(new_mat, index=index, + columns=new_data.columns) + elif axis == 1: + new_mat = new_data.data[:, indexer] + new_data = SparseFrame(new_mat, columns=index, + index=new_data.index) + else: + raise ValueError('Only supported axes are 0 and 1.') - def to_npz(self, filename): - """Save to numpy npz format.""" - to_npz(self, filename) + return new_data + + def reindex(self, labels=None, index=None, columns=None, axis=None, + *args, **kwargs): + """Conform SparseFrame to new index. + + Missing values will be filled with zeroes. + + Parameters + ---------- + labels: array-like + New labels / index to conform the axis specified by ‘axis’ to. + index, columns : array-like, optional + New labels / index to conform to. Preferably an Index object to + avoid duplicating data + axis: int + Axis to target. Can be either (0, 1). + args, kwargs + Will be passed to reindex_axis. 
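+
+        A rough sketch; all labels below are hypothetical::
+
+            conformed = sf.reindex(index=['a', 'b', 'c'], columns=['x', 'y'])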
+ + Returns + ------- + reindexed: SparseFrame + """ + + if labels is not None and index is None and columns is None: + if axis is None: + axis = 0 + return self.reindex_axis(labels, axis=axis, *args, **kwargs) + elif columns is not None and index is None: + return self.reindex_axis(columns, axis=1, *args, **kwargs) + elif columns is None and index is not None: + return self.reindex_axis(index, axis=0, *args, **kwargs) + elif columns is not None and index is not None: + obj = self.reindex_axis(columns, axis=1, *args, **kwargs) + return obj.reindex_axis(index, axis=0, *args, **kwargs) + else: + raise ValueError('Label parameter is mutually exclusive ' + 'with both index or columns') + + def reindex_axis(self, labels, axis=0, method=None, + level=None, copy=True, limit=None, fill_value=0): + """Conform SparseFrame to new index. + + Missing values will be filled with zeros. + + Parameters + ---------- + labels: array-like + New labels / index to conform the axis specified by ‘axis’ to. + axis: int + Axis to target. Can be either (0, 1). + method: None + unsupported + level: None + unsupported + copy: None + unsupported + limit: None + unsupported + fill_value: None + unsupported + + Returns + ------- + reindexed: SparseFrame + """ + if method is not None \ + or not copy \ + or level is not None \ + or fill_value != 0 \ + or limit is not None: + raise NotImplementedError( + 'Error only labels, index, columns and/or axis are supported') + if axis == 0: + self.index._can_reindex(labels) + reindex_axis = 'index' + other_axis = 'columns' + new_index, idx = self.index.reindex(labels) + if idx is None: + return self.copy() + new_data = self._data[idx] + elif axis == 1: + self.columns._can_reindex(labels) + reindex_axis = 'columns' + other_axis = 'index' + new_index, idx = self.columns.reindex(labels) + if idx is None: + return self.copy() + new_data = self._data.T[idx].T + if not self.empty: + # we have a hidden zero column to replace missing indices (-1) + new_data = new_data[:-1] + else: + raise ValueError("Only two dimensional data supported.") + + kwargs = {reindex_axis: new_index, + other_axis: getattr(self, other_axis)} + + return SparseFrame(new_data, **kwargs) + + def to_npz(self, filename, block_size=None, storage_options=None): + """Save to numpy npz format. 
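+
+        A minimal sketch; both paths below are hypothetical and the second
+        call assumes s3fs is installed::
+
+            sf.to_npz('/tmp/frame.npz')
+            sf.to_npz('s3://my-bucket/frame.npz',
+                      storage_options={'anon': False})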
+ + Parameters + ---------- + filename: str + path to local file ot s3 path starting with `s3://` + block_size: int + block size in bytes only has effect if writing to remote storage + if set to None defaults to 100MB + storage_options: dict + additional parameters to pass to FileSystem class; + only useful when writing to remote storages + """ + to_npz(self, filename, block_size, storage_options) + + +def _axis_is_empty(csr, axis=0): + return csr.shape[axis] == 0 def _aligned_csr_elop(a, b, a_idx, b_idx, op='_plus_', how='outer'): """Assume data == 0 at loc[-1]""" + + # handle emtpy cases + if _axis_is_empty(a): + return b[:-1, :], b_idx + + if _axis_is_empty(b): + return a[:-1, :], a_idx + join_idx, lidx, ridx = a_idx.join(b_idx, return_indexers=True, how=how) if lidx is None: - a_new = a[:-1,:] + a_new = a[:-1, :] else: a_new = sparse.csr_matrix(a[lidx]) if ridx is None: - b_new = b[:-1,:] + b_new = b[:-1, :] else: b_new = sparse.csr_matrix(b[ridx]) @@ -539,11 +1097,11 @@ def _matrix_join(a, b, a_idx, b_idx, how='outer'): join_idx, lidx, ridx = a_idx.join(b_idx, return_indexers=True, how=how) if lidx is None: - a_new = a[:-1,:] + a_new = a[:-1, :] else: a_new = sparse.csr_matrix(a[lidx]) if ridx is None: - b_new = b[:-1,:] + b_new = b[:-1, :] else: b_new = sparse.csr_matrix(b[ridx]) @@ -564,24 +1122,63 @@ def _create_group_matrix(group_idx, dtype='f8'): dtype=dtype).tocsr() -def sparse_one_hot(df, column, categories, dtype='f8', index_col=None): +def sparse_one_hot(df, column=None, categories=None, dtype='f8', + index_col=None, order=None, prefixes=False, + ignore_cat_order_mismatch=False): """ - One-hot encode a single column of a pandas.DataFrame. + One-hot encode specified columns of a pandas.DataFrame. Returns a SparseFrame. + + See the documentation of :func:`sparsity.dask.reshape.one_hot_encode`. """ - cols, csr = _one_hot_series_csr(categories, dtype, df[column]) + if column is not None: + warnings.warn( + '`column` argument of sparsity.sparse_frame.sparse_one_hot ' + 'function is deprecated.' + ) + if order is not None: + raise ValueError('`order` and `column` arguments cannot be used ' + 'together.') + categories = {column: categories} + + if order is not None: + categories = OrderedDict([(column, categories[column]) + for column in order]) + + new_cols = [] + csrs = [] + for column, column_cat in categories.items(): + if isinstance(column_cat, str): + column_cat = _just_read_array(column_cat) + cols, csr = _one_hot_series_csr( + column_cat, dtype, df[column], + ignore_cat_order_mismatch=ignore_cat_order_mismatch + ) + if prefixes: + cols = list(map(lambda x: '{}_{}'.format(column, x), cols)) + new_cols.extend(cols) + csrs.append(csr) + if len(set(new_cols)) < len(new_cols): + raise ValueError('Different columns have same categories. This would ' + 'result in duplicated column names. 
' + 'Set `prefix` to True to manage this situation.') + new_data = sparse.hstack(csrs, format='csr') if not isinstance(index_col, list): new_index = df[index_col] if index_col else df.index else: df = df.reset_index() new_index = pd.MultiIndex.from_arrays(df[index_col].values.T) - return SparseFrame(csr, index=new_index, columns=cols) + return SparseFrame(new_data, index=new_index, columns=new_cols) -def _one_hot_series_csr(categories, dtype, oh_col): +def _one_hot_series_csr(categories, dtype, oh_col, + ignore_cat_order_mismatch=False): if types.is_categorical_dtype(oh_col): - cat = oh_col + cat = oh_col.cat + _check_categories_order(cat.categories, categories, oh_col.name, + ignore_cat_order_mismatch) + else: s = oh_col cat = pd.Categorical(s, np.asarray(categories)) @@ -590,12 +1187,41 @@ def _one_hot_series_csr(categories, dtype, oh_col): n_samples = codes.size mask = codes != -1 if np.any(~mask): - raise ValueError("unknown categorical features present %s " - "during transform." % np.unique(s[~mask])) + raise ValueError("Unknown categorical features present " + "during transform: %s." % np.unique(s[~mask])) row_indices = np.arange(n_samples, dtype=np.int32) col_indices = codes data = np.ones(row_indices.size) data = sparse.coo_matrix((data, (row_indices, col_indices)), shape=(n_samples, n_features), dtype=dtype).tocsr() - return cat.categories.values, data \ No newline at end of file + return cat.categories.values, data + + +def _check_categories_order(categories1, categories2, categorical_column_name, + ignore_cat_order_mismatch): + """Check if two lists of categories differ. If they have different + elements, raise an exception. If they differ only by order of elements, + raise an exception unless ignore_cat_order_mismatch is set.""" + + if categories2 is None or list(categories2) == list(categories1): + return + + if set(categories2) == set(categories1): + mismatch_type = 'order' + else: + mismatch_type = 'set' + + if mismatch_type == 'set' or not ignore_cat_order_mismatch: + raise ValueError( + "Got categorical column {column_name} whose categories " + "{mismatch_type} doesn't match categories {mismatch_type} " + "given as argument to this function.".format( + column_name=categorical_column_name, + mismatch_type=mismatch_type + ) + ) + + +for _name, _indexer in get_indexers_list(): + SparseFrame._create_indexer(_name, _indexer) diff --git a/sparsity/src/atomic_defs.h b/sparsity/src/atomic_defs.h deleted file mode 100755 index b50a762..0000000 --- a/sparsity/src/atomic_defs.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef HL_ATOMIC_DEFS_H -#define HL_ATOMIC_DEFS_H - -#define ATOMIC_READ(_v) __sync_fetch_and_add(&(_v), 0) -#define ATOMIC_INCREMENT(_v) (void)__sync_fetch_and_add(&(_v), 1) -#define ATOMIC_DECREMENT(_v) (void)__sync_fetch_and_sub(&(_v), 1) -#define ATOMIC_INCREASE(_v, _n) __sync_add_and_fetch(&(_v), (_n)) -#define ATOMIC_DECREASE(_v, _n) __sync_sub_and_fetch(&(_v), (_n)) -#define ATOMIC_CAS(_v, _o, _n) __sync_bool_compare_and_swap(&(_v), (_o), (_n)) -#define ATOMIC_CAS_RETURN(_v, _o, _n) __sync_val_compare_and_swap(&(_v), (_o), (_n)) - -#define ATOMIC_SET(_v, _n) {\ - int _b = 0;\ - do {\ - _b = ATOMIC_CAS(_v, ATOMIC_READ(_v), _n);\ - } while (__builtin_expect(!_b, 0));\ -} - -#define ATOMIC_SET_IF(_v, _c, _n, _t) {\ - _t _o = ATOMIC_READ(_v);\ - while (__builtin_expect((_o _c (_n)) && !ATOMIC_CAS(_v, _o, _n), 0)) \ - _o = ATOMIC_READ(_v);\ -} - - -#ifdef THREAD_SAFE - -#define __POSIX_C_SOURCE -#include <pthread.h> - -#ifdef __MACH__ -#include <libkern/OSAtomic.h> -#endif - 
-#define MUTEX_INIT(_mutex) if (__builtin_expect(pthread_mutex_init(&(_mutex), 0) != 0, 0)) { abort(); } -#define MUTEX_DESTROY(_mutex) pthread_mutex_destroy(&(_mutex)) -#define MUTEX_LOCK(_mutex) if (__builtin_expect(pthread_mutex_lock(&(_mutex)) != 0, 0)) { abort(); } -#define MUTEX_UNLOCK(_mutex) if (__builtin_expect(pthread_mutex_unlock(&(_mutex)) != 0, 0)) { abort(); } -#ifdef __MACH__ -#define SPIN_INIT(_mutex) ((_mutex) = 0) -#define SPIN_DESTROY(_mutex) -#define SPIN_LOCK(_mutex) OSSpinLockLock(&(_mutex)) -#define SPIN_UNLOCK(_mutex) OSSpinLockUnlock(&(_mutex)) -#else -#define SPIN_INIT(_mutex) pthread_spin_init(&(_mutex), 0) -#define SPIN_DESTROY(_mutex) pthread_spin_destroy(&(_mutex)) -#define SPIN_LOCK(_mutex) if (__builtin_expect(pthread_spin_lock(&(_mutex)) != 0, 0)) { abort(); } -#define SPIN_UNLOCK(_mutex) if (__builtin_expect(pthread_spin_unlock(&(_mutex)) != 0, 0)) { abort(); } -#endif -#else -#define MUTEX_INIT(_mutex) -#define MUTEX_DESTROY(_mutex) -#define MUTEX_LOCK(_mutex) -#define MUTEX_UNLOCK(_mutex) -#define SPIN_INIT(_mutex) -#define SPIN_DESTROY(_mutex) -#define SPIN_LOCK(_mutex) -#define SPIN_UNLOCK(_mutex) -#endif - -#endif //ATOMIC_DEFS_H - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/bsd_queue.h b/sparsity/src/bsd_queue.h deleted file mode 100755 index 5ef6c87..0000000 --- a/sparsity/src/bsd_queue.h +++ /dev/null @@ -1,564 +0,0 @@ -/*- - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)queue.h 8.5 (Berkeley) 8/20/94 - * $FreeBSD: src/sys/sys/queue.h,v 1.60.2.1 2005/08/16 22:41:39 phk Exp $ - */ - -#ifndef HL_SYS_QUEUE_H -#define HL_SYS_QUEUE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/cdefs.h> - -/* - * This file defines four types of data structures: singly-linked lists, - * singly-linked tail queues, lists and tail queues. - * - * A singly-linked list is headed by a single forward pointer. 
The elements - * are singly linked for minimum space and pointer manipulation overhead at - * the expense of O(n) removal for arbitrary elements. New elements can be - * added to the list after an existing element or at the head of the list. - * Elements being removed from the head of the list should use the explicit - * macro for this purpose for optimum efficiency. A singly-linked list may - * only be traversed in the forward direction. Singly-linked lists are ideal - * for applications with large datasets and few or no removals or for - * implementing a LIFO queue. - * - * A singly-linked tail queue is headed by a pair of pointers, one to the - * head of the list and the other to the tail of the list. The elements are - * singly linked for minimum space and pointer manipulation overhead at the - * expense of O(n) removal for arbitrary elements. New elements can be added - * to the list after an existing element, at the head of the list, or at the - * end of the list. Elements being removed from the head of the tail queue - * should use the explicit macro for this purpose for optimum efficiency. - * A singly-linked tail queue may only be traversed in the forward direction. - * Singly-linked tail queues are ideal for applications with large datasets - * and few or no removals or for implementing a FIFO queue. - * - * A list is headed by a single forward pointer (or an array of forward - * pointers for a hash table header). The elements are doubly linked - * so that an arbitrary element can be removed without a need to - * traverse the list. New elements can be added to the list before - * or after an existing element or at the head of the list. A list - * may only be traversed in the forward direction. - * - * A tail queue is headed by a pair of pointers, one to the head of the - * list and the other to the tail of the list. The elements are doubly - * linked so that an arbitrary element can be removed without a need to - * traverse the list. New elements can be added to the list before or - * after an existing element, at the head of the list, or at the end of - * the list. A tail queue may be traversed in either direction. - * - * For details on the use of these macros, see the queue(3) manual page. 
- * - * - * SLIST LIST STAILQ TAILQ - * _HEAD + + + + - * _HEAD_INITIALIZER + + + + - * _ENTRY + + + + - * _INIT + + + + - * _EMPTY + + + + - * _FIRST + + + + - * _NEXT + + + + - * _PREV - - - + - * _LAST - - + + - * _FOREACH + + + + - * _FOREACH_SAFE + + + + - * _FOREACH_REVERSE - - - + - * _FOREACH_REVERSE_SAFE - - - + - * _INSERT_HEAD + + + + - * _INSERT_BEFORE - + - + - * _INSERT_AFTER + + + + - * _INSERT_TAIL - - + + - * _CONCAT - - + + - * _REMOVE_HEAD + - + - - * _REMOVE + + + + - * - */ -#define QUEUE_MACRO_DEBUG 0 -#if QUEUE_MACRO_DEBUG -/* Store the last 2 places the queue element or head was altered */ -struct qm_trace { - char * lastfile; - int lastline; - char * prevfile; - int prevline; -}; - -#define TRACEBUF struct qm_trace trace; -#define TRASHIT(x) do {(x) = (void *)-1;} while (0) - -#define QMD_TRACE_HEAD(head) do { \ - (head)->trace.prevline = (head)->trace.lastline; \ - (head)->trace.prevfile = (head)->trace.lastfile; \ - (head)->trace.lastline = __LINE__; \ - (head)->trace.lastfile = __FILE__; \ -} while (0) - -#define QMD_TRACE_ELEM(elem) do { \ - (elem)->trace.prevline = (elem)->trace.lastline; \ - (elem)->trace.prevfile = (elem)->trace.lastfile; \ - (elem)->trace.lastline = __LINE__; \ - (elem)->trace.lastfile = __FILE__; \ -} while (0) - -#else -#define QMD_TRACE_ELEM(elem) -#define QMD_TRACE_HEAD(head) -#define TRACEBUF -#define TRASHIT(x) -#endif /* QUEUE_MACRO_DEBUG */ - -/* - * Singly-linked List declarations. - */ -#define SLIST_HEAD(name, type) \ -struct name { \ - struct type *slh_first; /* first element */ \ -} - -#define SLIST_HEAD_INITIALIZER(head) \ - { NULL } - -#define SLIST_ENTRY(type) \ -struct { \ - struct type *sle_next; /* next element */ \ -} - -/* - * Singly-linked List functions. - */ -#define SLIST_EMPTY(head) ((head)->slh_first == NULL) - -#define SLIST_FIRST(head) ((head)->slh_first) - -#define SLIST_FOREACH(var, head, field) \ - for ((var) = SLIST_FIRST((head)); \ - (var); \ - (var) = SLIST_NEXT((var), field)) - -#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = SLIST_FIRST((head)); \ - (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ - for ((varp) = &SLIST_FIRST((head)); \ - ((var) = *(varp)) != NULL; \ - (varp) = &SLIST_NEXT((var), field)) - -#define SLIST_INIT(head) do { \ - SLIST_FIRST((head)) = NULL; \ -} while (0) - -#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ - SLIST_NEXT((slistelm), field) = (elm); \ -} while (0) - -#define SLIST_INSERT_HEAD(head, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ - SLIST_FIRST((head)) = (elm); \ -} while (0) - -#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) - -#define SLIST_REMOVE(head, elm, type, field) do { \ - if (SLIST_FIRST((head)) == (elm)) { \ - SLIST_REMOVE_HEAD((head), field); \ - } \ - else { \ - struct type *curelm = SLIST_FIRST((head)); \ - while (SLIST_NEXT(curelm, field) != (elm)) \ - curelm = SLIST_NEXT(curelm, field); \ - SLIST_NEXT(curelm, field) = \ - SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ - } \ -} while (0) - -#define SLIST_REMOVE_HEAD(head, field) do { \ - SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ -} while (0) - -/* - * Singly-linked Tail queue declarations. 
- */ -#define STAILQ_HEAD(name, type) \ -struct name { \ - struct type *stqh_first;/* first element */ \ - struct type **stqh_last;/* addr of last next element */ \ -} - -#define STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).stqh_first } - -#define STAILQ_ENTRY(type) \ -struct { \ - struct type *stqe_next; /* next element */ \ -} - -/* - * Singly-linked Tail queue functions. - */ -#define STAILQ_CONCAT(head1, head2) do { \ - if (!STAILQ_EMPTY((head2))) { \ - *(head1)->stqh_last = (head2)->stqh_first; \ - (head1)->stqh_last = (head2)->stqh_last; \ - STAILQ_INIT((head2)); \ - } \ -} while (0) - -#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) - -#define STAILQ_FIRST(head) ((head)->stqh_first) - -#define STAILQ_FOREACH(var, head, field) \ - for((var) = STAILQ_FIRST((head)); \ - (var); \ - (var) = STAILQ_NEXT((var), field)) - - -#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = STAILQ_FIRST((head)); \ - (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define STAILQ_INIT(head) do { \ - STAILQ_FIRST((head)) = NULL; \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ - STAILQ_NEXT((tqelm), field) = (elm); \ -} while (0) - -#define STAILQ_INSERT_HEAD(head, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ - STAILQ_FIRST((head)) = (elm); \ -} while (0) - -#define STAILQ_INSERT_TAIL(head, elm, field) do { \ - STAILQ_NEXT((elm), field) = NULL; \ - *(head)->stqh_last = (elm); \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ -} while (0) - -#define STAILQ_LAST(head, type, field) \ - (STAILQ_EMPTY((head)) ? \ - NULL : \ - ((struct type *) \ - ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) - -#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) - -#define STAILQ_REMOVE(head, elm, type, field) do { \ - if (STAILQ_FIRST((head)) == (elm)) { \ - STAILQ_REMOVE_HEAD((head), field); \ - } \ - else { \ - struct type *curelm = STAILQ_FIRST((head)); \ - while (STAILQ_NEXT(curelm, field) != (elm)) \ - curelm = STAILQ_NEXT(curelm, field); \ - if ((STAILQ_NEXT(curelm, field) = \ - STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ - } \ -} while (0) - -#define STAILQ_REMOVE_HEAD(head, field) do { \ - if ((STAILQ_FIRST((head)) = \ - STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ - if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -/* - * List declarations. - */ -#define LIST_HEAD(name, type) \ -struct name { \ - struct type *lh_first; /* first element */ \ -} - -#define LIST_HEAD_INITIALIZER(head) \ - { NULL } - -#define LIST_ENTRY(type) \ -struct { \ - struct type *le_next; /* next element */ \ - struct type **le_prev; /* address of previous next element */ \ -} - -/* - * List functions. 
- */ - -#define LIST_EMPTY(head) ((head)->lh_first == NULL) - -#define LIST_FIRST(head) ((head)->lh_first) - -#define LIST_FOREACH(var, head, field) \ - for ((var) = LIST_FIRST((head)); \ - (var); \ - (var) = LIST_NEXT((var), field)) - -#define LIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = LIST_FIRST((head)); \ - (var) && ((tvar) = LIST_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define LIST_INIT(head) do { \ - LIST_FIRST((head)) = NULL; \ -} while (0) - -#define LIST_INSERT_AFTER(listelm, elm, field) do { \ - if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ - LIST_NEXT((listelm), field)->field.le_prev = \ - &LIST_NEXT((elm), field); \ - LIST_NEXT((listelm), field) = (elm); \ - (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ -} while (0) - -#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ - (elm)->field.le_prev = (listelm)->field.le_prev; \ - LIST_NEXT((elm), field) = (listelm); \ - *(listelm)->field.le_prev = (elm); \ - (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ -} while (0) - -#define LIST_INSERT_HEAD(head, elm, field) do { \ - if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ - LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ - LIST_FIRST((head)) = (elm); \ - (elm)->field.le_prev = &LIST_FIRST((head)); \ -} while (0) - -#define LIST_NEXT(elm, field) ((elm)->field.le_next) - -#define LIST_REMOVE(elm, field) do { \ - if (LIST_NEXT((elm), field) != NULL) \ - LIST_NEXT((elm), field)->field.le_prev = \ - (elm)->field.le_prev; \ - *(elm)->field.le_prev = LIST_NEXT((elm), field); \ -} while (0) - -/* - * Tail queue declarations. - */ -#define TAILQ_HEAD(name, type) \ -struct name { \ - struct type *tqh_first; /* first element */ \ - struct type **tqh_last; /* addr of last next element */ \ - TRACEBUF \ -} - -#define TAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).tqh_first } - -#define TAILQ_ENTRY(type) \ -struct { \ - struct type *tqe_next; /* next element */ \ - struct type **tqe_prev; /* address of previous next element */ \ - TRACEBUF \ -} - -/* - * Tail queue functions. 
- */ -#define TAILQ_CONCAT(head1, head2, field) do { \ - if (!TAILQ_EMPTY(head2)) { \ - *(head1)->tqh_last = (head2)->tqh_first; \ - (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ - (head1)->tqh_last = (head2)->tqh_last; \ - TAILQ_INIT((head2)); \ - QMD_TRACE_HEAD(head1); \ - QMD_TRACE_HEAD(head2); \ - } \ -} while (0) - -#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) - -#define TAILQ_FIRST(head) ((head)->tqh_first) - -#define TAILQ_FOREACH(var, head, field) \ - for ((var) = TAILQ_FIRST((head)); \ - (var); \ - (var) = TAILQ_NEXT((var), field)) - -#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = TAILQ_FIRST((head)); \ - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var); \ - (var) = TAILQ_PREV((var), headname, field)) - -#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ - (var) = (tvar)) - -#define TAILQ_INIT(head) do { \ - TAILQ_FIRST((head)) = NULL; \ - (head)->tqh_last = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ -} while (0) - -#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ - if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ - TAILQ_NEXT((elm), field)->field.tqe_prev = \ - &TAILQ_NEXT((elm), field); \ - else { \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - } \ - TAILQ_NEXT((listelm), field) = (elm); \ - (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&listelm->field); \ -} while (0) - -#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ - (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ - TAILQ_NEXT((elm), field) = (listelm); \ - *(listelm)->field.tqe_prev = (elm); \ - (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&listelm->field); \ -} while (0) - -#define TAILQ_INSERT_HEAD(head, elm, field) do { \ - if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ - TAILQ_FIRST((head))->field.tqe_prev = \ - &TAILQ_NEXT((elm), field); \ - else \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - TAILQ_FIRST((head)) = (elm); \ - (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_INSERT_TAIL(head, elm, field) do { \ - TAILQ_NEXT((elm), field) = NULL; \ - (elm)->field.tqe_prev = (head)->tqh_last; \ - *(head)->tqh_last = (elm); \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_LAST(head, headname) \ - (*(((struct headname *)((head)->tqh_last))->tqh_last)) - -#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) - -#define TAILQ_PREV(elm, headname, field) \ - (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) - -#define TAILQ_REMOVE(head, elm, field) do { \ - if ((TAILQ_NEXT((elm), field)) != NULL) \ - TAILQ_NEXT((elm), field)->field.tqe_prev = \ - (elm)->field.tqe_prev; \ - else { \ - (head)->tqh_last = (elm)->field.tqe_prev; \ - QMD_TRACE_HEAD(head); \ - } \ - *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ - TRASHIT((elm)->field.tqe_next); \ - TRASHIT((elm)->field.tqe_prev); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - - -#ifdef _KERNEL - -/* - * XXX insque() and remque() are an old way 
of handling certain queues. - * They bogusly assumes that all queue heads look alike. - */ - -struct quehead { - struct quehead *qh_link; - struct quehead *qh_rlink; -}; - -#ifdef __CC_SUPPORTS___INLINE - -static __inline void -insque(void *a, void *b) -{ - struct quehead *element = (struct quehead *)a, - *head = (struct quehead *)b; - - element->qh_link = head->qh_link; - element->qh_rlink = head; - head->qh_link = element; - element->qh_link->qh_rlink = element; -} - -static __inline void -remque(void *a) -{ - struct quehead *element = (struct quehead *)a; - - element->qh_link->qh_rlink = element->qh_rlink; - element->qh_rlink->qh_link = element->qh_link; - element->qh_rlink = 0; -} - -#else /* !__CC_SUPPORTS___INLINE */ - -void insque(void *a, void *b); -void remque(void *a); - -#endif /* __CC_SUPPORTS___INLINE */ - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* !_SYS_QUEUE_H_ */ - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/hashtable.c b/sparsity/src/hashtable.c deleted file mode 100755 index e39f829..0000000 --- a/sparsity/src/hashtable.c +++ /dev/null @@ -1,1014 +0,0 @@ -#include <sys/types.h> -#include <stdlib.h> -#include <stdio.h> -#include <limits.h> -#include <strings.h> -#include <sched.h> - -#include "bsd_queue.h" -#include "atomic_defs.h" -#include "hashtable.h" - -#define HT_KEY_EQUALS(_k1, _kl1, _k2, _kl2) \ - (((char *)(_k1))[0] == ((char *)(_k2))[0] && \ - (_kl1) == (_kl2) && \ - memcmp((_k1), (_k2), (_kl1)) == 0) - - -typedef struct _ht_item { - uint32_t hash; - char kbuf[32]; - void *key; - size_t klen; - void *data; - size_t dlen; - TAILQ_ENTRY(_ht_item) next; -} __attribute__((packed)) ht_item_t; - -typedef struct _ht_item_list { - TAILQ_HEAD(, _ht_item) head; -#ifdef THREAD_SAFE -#ifdef __MACH__ - OSSpinLock lock; -#else - pthread_spinlock_t lock; -#endif -#endif - size_t index; - TAILQ_ENTRY(_ht_item_list) iterator_next; -} __attribute__((packed)) ht_items_list_t; - -typedef struct { - TAILQ_HEAD(, _ht_item_list) head; -} __attribute__((packed)) ht_iterator_list_t; - -// NOTE : order here matters (and also numbering) -typedef enum { - HT_STATUS_CLEAR = 0, - HT_STATUS_WRITE = 1, - HT_STATUS_GROW = 2, - HT_STATUS_IDLE = 3, - HT_STATUS_READ = 4 -} __attribute__((packed)) ht_status_t; - -struct _hashtable_s { - size_t size; - size_t max_size; - size_t count; - ht_status_t status; - uint32_t seed; - ht_items_list_t **items; - ht_free_item_callback_t free_item_cb; - ht_iterator_list_t *iterator_list; -#ifdef THREAD_SAFE - pthread_mutex_t iterator_lock; -#endif -} __attribute__((packed)); - -typedef struct _ht_iterator_callback { - int (*cb)(); - void *user; - size_t count; - hashtable_t *table; -} ht_iterator_callback_t; - -typedef struct _ht_collector_arg { - linked_list_t *output; - size_t count; -} ht_collector_arg_t; - -static inline uint32_t -ht_hash_one_at_a_time(hashtable_t *table, const unsigned char *str, const ssize_t len) -{ - const unsigned char * const end = (const unsigned char *)str + len; - uint32_t hash = table->seed + len; - while (str < end) { - hash += *str++; - hash += (hash << 10); - hash ^= (hash >> 6); - } - hash += (hash << 3); - hash ^= (hash >> 11); - return (hash + (hash << 15)); -} - - -hashtable_t * -ht_create(size_t initial_size, size_t max_size, ht_free_item_callback_t cb) -{ - hashtable_t *table = (hashtable_t *)calloc(1, sizeof(hashtable_t)); - - if (table && ht_init(table, initial_size, max_size, cb) != 0) { - 
free(table); - return NULL; - } - - return table; -} - -int -ht_init(hashtable_t *table, - size_t initial_size, - size_t max_size, - ht_free_item_callback_t cb) -{ - table->size = initial_size > HT_SIZE_MIN ? initial_size : HT_SIZE_MIN; - table->max_size = max_size; - table->items = (ht_items_list_t **)calloc(table->size, sizeof(ht_items_list_t *)); - if (!table->items) - return -1; - - table->status = HT_STATUS_IDLE; - - ht_set_free_item_callback(table, cb); -#ifdef BSD - table->seed = arc4random()%UINT32_MAX; -#else - table->seed = random()%UINT32_MAX; -#endif - table->iterator_list = calloc(1, sizeof(ht_iterator_list_t)); - if (!table->iterator_list) { - free(table->items); - return -1; - } - TAILQ_INIT(&table->iterator_list->head); - - MUTEX_INIT(table->iterator_lock); - - return 0; -} - -void -ht_set_free_item_callback(hashtable_t *table, ht_free_item_callback_t cb) -{ - ATOMIC_SET(table->free_item_cb, cb); -} - -void -ht_clear(hashtable_t *table) -{ - while(!ATOMIC_CAS(table->status, HT_STATUS_IDLE, HT_STATUS_CLEAR)) - sched_yield(); - - MUTEX_LOCK(table->iterator_lock); - ht_items_list_t *tmplist, *list = NULL; - TAILQ_FOREACH_SAFE(list, &table->iterator_list->head, iterator_next, tmplist) { - SPIN_LOCK(list->lock); - - ht_item_t *item = NULL; - ht_item_t *tmp; - - TAILQ_FOREACH_SAFE(item, &list->head, next, tmp) { - TAILQ_REMOVE(&list->head, item, next); - if (table->free_item_cb) - table->free_item_cb(item->data); - if (item->key != item->kbuf) - free(item->key); - free(item); - ATOMIC_DECREMENT(table->count); - } - - table->items[list->index] = NULL; - TAILQ_REMOVE(&table->iterator_list->head, list, iterator_next); - SPIN_UNLOCK(list->lock); - SPIN_DESTROY(list->lock); - free(list); - } - MUTEX_UNLOCK(table->iterator_lock); - - ATOMIC_CAS(table->status, HT_STATUS_CLEAR, HT_STATUS_IDLE); -} - -void -ht_destroy(hashtable_t *table) -{ - ht_clear(table); - free(table->items); - MUTEX_DESTROY(table->iterator_lock); - free(table->iterator_list); - free(table); -} - -static inline void -ht_grow_table(hashtable_t *table) -{ - // if we are not able to change the status now, let's return. 
- // ht_grow_table() will be called again next time a new key has been set - if (!ATOMIC_CAS(table->status, HT_STATUS_IDLE, HT_STATUS_GROW)) - return; - - // extra check if the table has been already updated by another thread in the meanwhile - if (table->max_size && ATOMIC_READ(table->size) >= table->max_size) { - ATOMIC_CAS(table->status, HT_STATUS_GROW, HT_STATUS_IDLE); - return; - } - - ht_iterator_list_t *new_iterator_list = calloc(1, sizeof(ht_iterator_list_t)); - if (!new_iterator_list) { - ATOMIC_CAS(table->status, HT_STATUS_GROW, HT_STATUS_IDLE); - return; - } - - TAILQ_INIT(&new_iterator_list->head); - - size_t new_size = ATOMIC_READ(table->size) << 1; - - if (table->max_size && new_size > table->max_size) - new_size = table->max_size; - - ht_items_list_t **items_list = - (ht_items_list_t **)calloc(new_size, sizeof(ht_items_list_t *)); - - if (!items_list) { - free(new_iterator_list); - ATOMIC_CAS(table->status, HT_STATUS_GROW, HT_STATUS_IDLE); - return; - } - - ht_item_t *item = NULL; - - MUTEX_LOCK(table->iterator_lock); - - ht_items_list_t **old_items = table->items; - table->items = items_list; - ATOMIC_SET(table->size, new_size); - - ht_items_list_t *tmp, *list = NULL; - TAILQ_FOREACH_SAFE(list, &table->iterator_list->head, iterator_next, tmp) { - // NOTE : list->index is safe to access outside of the lock - ATOMIC_SET(old_items[list->index], NULL); - // now readers can't access this list anymore - SPIN_LOCK(list->lock); - - // move all the items from the old list to the new one - while((item = TAILQ_FIRST(&list->head))) { - ht_items_list_t *new_list = ATOMIC_READ(items_list[item->hash%new_size]); - if (!new_list) { - new_list = malloc(sizeof(ht_items_list_t)); - // XXX - if malloc fails here the table is irremediably corrupted - // so there is no point in handling the case. - // TODO : using an internal prealloc'd bufferpool would ensure - // us to always obtain a valid pointer here - TAILQ_INIT(&new_list->head); - SPIN_INIT(new_list->lock); - size_t index = item->hash%new_size; - ATOMIC_SET(items_list[index], new_list); - new_list->index = index; - TAILQ_INSERT_TAIL(&new_iterator_list->head, new_list, iterator_next); - } - TAILQ_REMOVE(&list->head, item, next); - TAILQ_INSERT_TAIL(&new_list->head, item, next); - - } - - // we can now unregister the list from the iterator and release it - TAILQ_REMOVE(&table->iterator_list->head, list, iterator_next); - SPIN_UNLOCK(list->lock); - SPIN_DESTROY(list->lock); - free(list); - } - - // swap the iterator list - free(table->iterator_list); - table->iterator_list = new_iterator_list; - MUTEX_UNLOCK(table->iterator_lock); - - ATOMIC_CAS(table->status, HT_STATUS_GROW, HT_STATUS_IDLE); - - free(old_items); - - //fprintf(stderr, "Done growing table\n"); -} - -static inline ht_items_list_t * -ht_get_list(hashtable_t *table, uint32_t hash) -{ - size_t index = hash%ATOMIC_READ(table->size); - - // first try updating the status assuming we are the first reader requesting - // access to the table - uint32_t status; - do { - status = ATOMIC_CAS_RETURN(table->status, HT_STATUS_IDLE, HT_STATUS_READ); - // NOTE : if some writer is accessing the table we need to wait, - // multiple readers (status greater than IDLE) are allowed. 
- // In the unlikely event that sched_yield() fails, we break the loop - // but we will still check if the status is valid and in case it's not - // (so the cas operation didn't succeed) we will synchronize again with - // the other threads - } while (status < HT_STATUS_IDLE && sched_yield() == 0); - - // if some other reader is running in a background thread and has already - // updated the status (so it's already greater than IDLE), let's take that - // into account and try incrementing the status value by one - while (status != HT_STATUS_IDLE && - !(status >= HT_STATUS_READ && ATOMIC_CAS(table->status, status, status + 1))) - { - // if we didn't succeed incrementing the status, maybe the other readers finished - // their job and it was already put back to IDLE - status = ATOMIC_CAS_RETURN(table->status, HT_STATUS_IDLE, HT_STATUS_READ); - if (status < HT_STATUS_IDLE) // some writer is accessing the table, we need to wait - sched_yield(); - } - - status++; // status now holds the value we have updated in table->status - - // we can now safely retrieve the list - ht_items_list_t *list = table->items[index]; - - // NOTE: it's important here to lock the list while the status - // has not been put back to idle yet (so writes can't happen) - if (list) - SPIN_LOCK(list->lock); - - // now let's update the status by decrementing it - // NOTE: if we are the last active reader it will go down to the idle state - do { - if (ATOMIC_CAS(table->status, status, status -1)) - break; - status = ATOMIC_CAS_RETURN(table->status, HT_STATUS_READ, HT_STATUS_IDLE); - } while (status > HT_STATUS_READ); - - // NOTE: the returned list is already locked - return list; -} - -static inline ht_items_list_t * -ht_set_list(hashtable_t *table, uint32_t hash) -{ - ht_items_list_t *list = malloc(sizeof(ht_items_list_t)); - if (!list) - return NULL; - - SPIN_INIT(list->lock); - TAILQ_INIT(&list->head); - SPIN_LOCK(list->lock); - - size_t index = hash%ATOMIC_READ(table->size); - list->index = index; - - while (!ATOMIC_CAS(table->status, HT_STATUS_IDLE, HT_STATUS_WRITE)) - sched_yield(); - - // NOTE: once the status has been set to WRITE no other threads can access the table - - index = hash%ATOMIC_READ(table->size); - list->index = index; - - // NOTE: since nobody could have changed the status in the meanwhile - if (table->items[index]) { - // if there is a list already set at our index it means that some other - // thread succeded in setting a new list already, completing its job before - // we were able to update the table status. 
- // So we can release our newly created list and return the existing one - SPIN_UNLOCK(list->lock); - SPIN_DESTROY(list->lock); - free(list); - list = table->items[index]; - SPIN_LOCK(list->lock); - ATOMIC_CAS(table->status, HT_STATUS_WRITE, HT_STATUS_IDLE); - return list; - } - - table->items[index] = list; - - // it's safe to assume the status still WRITE, - // so we don't need to check if the CAS operation succeeded - ATOMIC_CAS(table->status, HT_STATUS_WRITE, HT_STATUS_IDLE); - - MUTEX_LOCK(table->iterator_lock); - TAILQ_INSERT_TAIL(&table->iterator_list->head, list, iterator_next); - MUTEX_UNLOCK(table->iterator_lock); - - // NOTE: the newly created list is already locked - return list; -} - -static inline int -ht_set_internal(hashtable_t *table, - void *key, - size_t klen, - void *data, - size_t dlen, - void **prev_data, - size_t *prev_len, - int copy, - int inx) -{ - void *prev = NULL; - size_t plen = 0; - - if (!klen) - return -1; - - uint32_t hash = ht_hash_one_at_a_time(table, key, klen); - - // let's first try checking if we fall in an existing bucket list - ht_items_list_t *list = ht_get_list(table, hash); - - if (!list) // if not, let's create a new bucket list - list = ht_set_list(table, hash); - - if (!list) - return -1; - - ht_item_t *item = NULL; - TAILQ_FOREACH(item, &list->head, next) { - if (/*ht_item->hash == arg->item.hash && */ - HT_KEY_EQUALS(item->key, item->klen, key, klen)) - { - prev = item->data; - plen = item->dlen; - break; - } - } - - if (!prev) { - ht_item_t *item = (ht_item_t *)calloc(1, sizeof(ht_item_t)); - if (!item) { - //fprintf(stderr, "Can't create new item: %s\n", strerror(errno)); - SPIN_UNLOCK(list->lock); - return -1; - } - item->hash = hash; - item->klen = klen; - - if (klen > sizeof(item->kbuf)) { - item->key = malloc(klen); - if (!item->key) { - free(item); - SPIN_UNLOCK(list->lock); - return -1; - } - } else { - item->key = item->kbuf; - } - - memcpy(item->key, key, klen); - - if (copy) { - if (dlen) { - item->data = malloc(dlen); - if (!item->data) { - if (klen > sizeof(item->kbuf)) - free(item->key); - free(item); - SPIN_UNLOCK(list->lock); - return -1; - } - memcpy(item->data, data, dlen); - } else { - item->data = NULL; - } - } else { - item->data = data; - } - item->dlen = dlen; - - TAILQ_INSERT_TAIL(&list->head, item, next); - ATOMIC_INCREMENT(table->count); - } else { - if (inx) { - if (prev_data) - *prev_data = prev; - if (prev_len) - *prev_len = plen; - SPIN_UNLOCK(list->lock); - return 1; - } - item->dlen = dlen; - if (copy) { - void *dcopy = malloc(dlen); - if (!dcopy) { - SPIN_UNLOCK(list->lock); - return -1; - } - - item->data = dcopy; - memcpy(item->data, data, dlen); - } else { - item->data = data; - } - } - - SPIN_UNLOCK(list->lock); - - size_t current_size = ATOMIC_READ(table->size); - if (ht_count(table) > (current_size + (current_size/3)) && - (!table->max_size || current_size < table->max_size)) - { - ht_grow_table(table); - } - - if (prev) { - if (prev_data) - *prev_data = prev; - else if (table->free_item_cb) - table->free_item_cb(prev); - } else if (prev_data) { - *prev_data = NULL; - } - - if (prev_len) - *prev_len = plen; - - return 0; -} - -int -ht_set(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen) -{ - return ht_set_internal(table, key, klen, data, dlen, NULL, NULL, 0, 0); -} - -int -ht_set_if_not_exists(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen) -{ - return ht_set_internal(table, key, klen, data, dlen, NULL, NULL, 0, 1); -} - -int -ht_get_or_set(hashtable_t *table, 
- void *key, - size_t klen, - void *data, - size_t dlen, - void **cur_data, - size_t *cur_len) -{ - return ht_set_internal(table, key, klen, data, dlen, cur_data, cur_len, 0, 1); -} - -int -ht_get_and_set(hashtable_t *table, - void *key, - size_t klen, - void *data, - size_t dlen, - void **prev_data, - size_t *prev_len) -{ - return ht_set_internal(table, key, klen, data, dlen, prev_data, prev_len, 0, 0); -} - -int -ht_set_copy(hashtable_t *table, - void *key, - size_t klen, - void *data, - size_t dlen, - void **prev_data, - size_t *prev_len) -{ - return ht_set_internal(table, key, klen, data, dlen, prev_data, prev_len, 1, 0); -} - -static inline int -ht_call_internal(hashtable_t *table, - void *key, - size_t klen, - ht_pair_callback_t cb, - void *user) -{ - int ret = -1; - - uint32_t hash = ht_hash_one_at_a_time(table, key, klen); - - ht_items_list_t *list = ht_get_list(table, hash); - if (!list) - return ret; - - ht_item_t *item = NULL; - ht_item_t *tmp; - TAILQ_FOREACH_SAFE(item, &list->head, next, tmp) { - if (/*ht_item->hash == arg->item.hash && */ - HT_KEY_EQUALS(item->key, item->klen, key, klen)) - { - if (cb) { - ret = cb(table, key, klen, &item->data, &item->dlen, user); - if (ret == 1) { - TAILQ_REMOVE(&list->head, item, next); - if (item->key != item->kbuf) - free(item->key); - free(item); - ATOMIC_DECREMENT(table->count); - ret = 0; - } - } else { - ret = 0; - } - break; - } - } - - SPIN_UNLOCK(list->lock); - - return ret; -} - -int -ht_call(hashtable_t *table, - void *key, - size_t klen, - ht_pair_callback_t cb, - void *user) -{ - return ht_call_internal(table, key, klen, cb, user); -} - -typedef struct { - void *data; - size_t dlen; - void *match; - size_t match_size; - int matched; - void **prev_data; - size_t *prev_len; -} ht_set_if_equals_helper_arg_t; - -static int -ht_set_if_equals_helper(hashtable_t *table, void *key __attribute__ ((unused)), size_t klen __attribute__ ((unused)), void **value, size_t *vlen, void *user) -{ - ht_set_if_equals_helper_arg_t *arg = (ht_set_if_equals_helper_arg_t *)user; - - if (arg->prev_len) - *arg->prev_len = *vlen; - - if (arg->prev_data) - *arg->prev_data = *value; - - if (arg->match_size == *vlen && ((char *)*value)[0] == *((char *)arg->match) && - memcmp(*value, arg->match, arg->match_size) == 0) - { - arg->matched = 1; - - if (!arg->prev_data && table->free_item_cb) - table->free_item_cb(*value); - - *value = arg->data; - *vlen = arg->dlen; - } - - return 0; -} - -int -ht_set_if_equals(hashtable_t *table, - void *key, - size_t klen, - void *data, - size_t dlen, - void *match, - size_t match_size, - void **prev_data, - size_t *prev_len) -{ - if (!match && match_size == 0) - return ht_set_if_not_exists(table, key, klen, data, dlen); - - ht_set_if_equals_helper_arg_t arg = { - .data = data, - .dlen = dlen, - .match = match, - .match_size = match_size, - .matched = 0, - .prev_data = prev_data, - .prev_len = prev_len - }; - if (ht_call_internal(table, key, klen, ht_set_if_equals_helper, (void *)&arg) == 0) - { - return arg.matched ? 
0 : 1; - } - return -1; -} - - -typedef struct -{ - int unset; - void **prev_data; - size_t *prev_len; - void *match; - size_t match_size; -} ht_delete_helper_arg_t; - -static int -ht_delete_helper(hashtable_t *table, void *key __attribute__ ((unused)), size_t klen __attribute__ ((unused)), void **value, size_t *vlen, void *user) -{ - ht_delete_helper_arg_t *arg = (ht_delete_helper_arg_t *)user; - - if (arg->match && (arg->match_size != *vlen || memcmp(arg->match, *value, *vlen) != 0)) - return -1; - - if (arg->prev_data) - *arg->prev_data = *value; - else if (table->free_item_cb) - table->free_item_cb(*value); - - if (arg->prev_len) - *arg->prev_len = *vlen; - - if (arg->unset) { - *vlen = 0; - *value = NULL; - return 0; - } - - return 1; // we want the item to be removed -} - -int -ht_unset(hashtable_t *table, - void *key, - size_t klen, - void **prev_data, - size_t *prev_len) -{ - ht_delete_helper_arg_t arg = { - .unset = 1, - .prev_data = prev_data, - .prev_len = prev_len, - .match = NULL, - .match_size = 0 - }; - - return ht_call_internal(table, key, klen, ht_delete_helper, (void *)&arg); -} - -static inline int -ht_delete_internal (hashtable_t *table, - void *key, - size_t klen, - void **prev_data, - size_t *prev_len, - void *match, - size_t match_size) -{ - - ht_delete_helper_arg_t arg = { - .unset = 0, - .prev_data = prev_data, - .prev_len = prev_len, - .match = match, - .match_size = match_size - }; - - return ht_call_internal(table, key, klen, ht_delete_helper, (void *)&arg); -} - -int -ht_delete (hashtable_t *table, - void *key, - size_t klen, - void **prev_data, - size_t *prev_len) -{ - return ht_delete_internal(table, key, klen, prev_data, prev_len, NULL, 0); -} - -int -ht_delete_if_equals(hashtable_t *table, void *key, size_t klen, void *match, size_t match_size) -{ - return ht_delete_internal(table, key, klen, NULL, NULL, match, match_size); -} - -int -ht_exists(hashtable_t *table, void *key, size_t klen) -{ - return (ht_call_internal(table, key, klen, NULL, NULL) == 0); -} - -typedef struct { - void *data; - size_t *dlen; - int copy; - ht_deep_copy_callback_t copy_cb; - void *user; -} ht_get_helper_arg_t; - -static int -ht_get_helper(hashtable_t *table __attribute__ ((unused)), void *key __attribute__ ((unused)), size_t klen __attribute__ ((unused)), void **value, size_t *vlen, void *user) -{ - ht_get_helper_arg_t *arg = (ht_get_helper_arg_t *)user; - - if (arg->copy) { - if (arg->copy_cb) { - arg->data = arg->copy_cb(*value, *vlen, arg->user); - } else { - arg->data = malloc(*vlen); - if (!arg->data) - return -1; - memcpy(arg->data, *value, *vlen); - } - } else { - arg->data = *value; - } - - if (arg->dlen) - *arg->dlen = *vlen; - - - return 0; -} - -static inline void * -ht_get_internal(hashtable_t *table, - void *key, - size_t klen, - size_t *dlen, - int copy, - ht_deep_copy_callback_t copy_cb, - void *user) -{ - ht_get_helper_arg_t arg = { - .data = NULL, - .dlen = dlen, - .copy = copy, - .copy_cb = copy_cb, - .user = user - }; - - ht_call_internal(table, key, klen, ht_get_helper, (void *)&arg); - - return arg.data; -} - -void * -ht_get(hashtable_t *table, void *key, size_t klen, size_t *dlen) -{ - return ht_get_internal(table, key, klen, dlen, 0, NULL, NULL); -} - -void * -ht_get_copy(hashtable_t *table, void *key, size_t klen, size_t *dlen) -{ - return ht_get_internal(table, key, klen, dlen, 1, NULL, NULL); -} - -void * -ht_get_deep_copy(hashtable_t *table, void *key, size_t klen, - size_t *dlen, ht_deep_copy_callback_t copy_cb, void *user) -{ - return 
ht_get_internal(table, key, klen, dlen, 1, copy_cb, user); -} - -static void -free_key(hashtable_key_t *key) -{ - free(key->data); - free(key); -} - -linked_list_t * -ht_get_all_keys(hashtable_t *table) -{ - linked_list_t *output = list_create(); - list_set_free_value_callback(output, (free_value_callback_t)free_key); - - MUTEX_LOCK(table->iterator_lock); - ht_items_list_t *list = NULL; - TAILQ_FOREACH(list, &table->iterator_list->head, iterator_next) { - SPIN_LOCK(list->lock); - - ht_item_t *item = NULL; - TAILQ_FOREACH(item, &list->head, next) { - hashtable_key_t *key = malloc(sizeof(hashtable_key_t)); - if (!key) { - SPIN_UNLOCK(list->lock); - MUTEX_UNLOCK(table->iterator_lock); - list_destroy(output); - return NULL; - } - key->data = malloc(item->klen); - if (!key->data) { - SPIN_UNLOCK(list->lock); - MUTEX_UNLOCK(table->iterator_lock); - free(key); - list_destroy(output); - return NULL; - } - memcpy(key->data, item->key, item->klen); - key->len = item->klen; - key->vlen = item->dlen; - list_push_value(output, key); - } - - SPIN_UNLOCK(list->lock); - } - MUTEX_UNLOCK(table->iterator_lock); - return output; -} - -linked_list_t * -ht_get_all_values(hashtable_t *table) -{ - linked_list_t *output = list_create(); - list_set_free_value_callback(output, (free_value_callback_t)free); - - MUTEX_LOCK(table->iterator_lock); - ht_items_list_t *list = NULL; - TAILQ_FOREACH(list, &table->iterator_list->head, iterator_next) { - SPIN_LOCK(list->lock); - - ht_item_t *item = NULL; - TAILQ_FOREACH(item, &list->head, next) { - hashtable_value_t *v = malloc(sizeof(hashtable_value_t)); - if (!v) { - SPIN_UNLOCK(list->lock); - MUTEX_UNLOCK(table->iterator_lock); - list_destroy(output); - return NULL; - } - v->data = item->data; - v->len = item->dlen; - v->key = item->key; - v->klen = item->klen; - list_push_value(output, v); - } - - SPIN_UNLOCK(list->lock); - } - MUTEX_UNLOCK(table->iterator_lock); - return output; -} - -typedef struct { - int (*cb)(); - void *user; -} ht_iterator_arg_t; - -static int -ht_foreach_key_helper(hashtable_t *table, void *key, size_t klen, void *value __attribute__ ((unused)), size_t vlen __attribute__ ((unused)), void *user) -{ - ht_iterator_arg_t *arg = (ht_iterator_arg_t *)user; - return arg->cb(table, key, klen, arg->user); -} - -void -ht_foreach_key(hashtable_t *table, ht_key_iterator_callback_t cb, void *user) -{ - ht_iterator_arg_t arg = { cb, user }; - ht_foreach_pair(table, ht_foreach_key_helper, &arg); -} - -static int -ht_foreach_value_helper(hashtable_t *table, void *key __attribute__ ((unused)), size_t klen __attribute__ ((unused)), void *value, size_t vlen, void *user) -{ - ht_iterator_arg_t *arg = (ht_iterator_arg_t *)user; - return arg->cb(table, value, vlen, arg->user); -} - -void -ht_foreach_value(hashtable_t *table, ht_value_iterator_callback_t cb, void *user) -{ - ht_iterator_arg_t arg = { cb, user }; - ht_foreach_pair(table, ht_foreach_value_helper, &arg); -} - -void -ht_foreach_pair(hashtable_t *table, ht_pair_iterator_callback_t cb, void *user) -{ - int rc = 0; - - - MUTEX_LOCK(table->iterator_lock); - ht_items_list_t *list = NULL; - TAILQ_FOREACH(list, &table->iterator_list->head, iterator_next) { - SPIN_LOCK(list->lock); - ht_item_t *item = NULL; - TAILQ_FOREACH(item, &list->head, next) { - rc = cb(table, item->key, item->klen, item->data, item->dlen, user); - if (rc <= 0) - break; - } - - if (item) { - if (rc == 0) { - SPIN_UNLOCK(list->lock); - break; - } else if (rc < 0) { - TAILQ_REMOVE(&list->head, item, next); - if (table->free_item_cb) - 
table->free_item_cb(item->data); - if (item->key != item->kbuf) - free(item->key); - free(item); - if (rc == -2) { - SPIN_UNLOCK(list->lock); - break; - } - } - } - SPIN_UNLOCK(list->lock); - } - MUTEX_UNLOCK(table->iterator_lock); -} - -size_t -ht_count(hashtable_t *table) -{ - return ATOMIC_READ(table->count); -} - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/hashtable.h b/sparsity/src/hashtable.h deleted file mode 100755 index 820a3bc..0000000 --- a/sparsity/src/hashtable.h +++ /dev/null @@ -1,409 +0,0 @@ -/** - * @file hashtable.h - * @author Andrea Guzzo - * @date 22/09/2013 - * @brief Fast thread-safe hashtable implementation - * @note In case of failures reported from the pthread interface - * abort() will be called. Callers can catch SIGABRT if more - * actions need to be taken. - */ -#ifndef HL_HASHTABLE_H -#define HL_HASHTABLE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdint.h> -#include <stdbool.h> -#include "linklist.h" - -/** - * @brief Opaque structure representing the actual hash table descriptor - */ -typedef struct _hashtable_s hashtable_t; - -/** - * @brief Callback that, if provided, will be called to release the value resources - * when an item is being removed from the table - */ -typedef void (*ht_free_item_callback_t)(void *); - -#define HT_SIZE_MIN 128 - -/** - * @brief Create a new table descriptor - * @param initial_size : initial size of the table; if 0 HT_SIZE_MIN will be used as initial size - * @param max_size : maximum size the table can be grown up to - * @param free_item_cb : the callback to use when an item needs to be released - * @return a newly allocated and initialized table - * - * The table will be expanded if necessary - */ -hashtable_t *ht_create(size_t initial_size, size_t max_size, ht_free_item_callback_t free_item_cb); - -/** - * @brief Initialize a pre-allocated table descriptor - * - * This function can be used to initialize a statically defined table - * @return 0 on success; -1 otherwise - */ -int ht_init(hashtable_t *table, size_t initial_size, size_t max_size, ht_free_item_callback_t free_item_cb); - -/** - * @brief Set the callback which must be called to release values stored in the table - * @param table : A valid pointer to an hashtable_t structure - * @param cb : an ht_free_item_callback_t function - */ -void ht_set_free_item_callback(hashtable_t *table, ht_free_item_callback_t cb); - -/** - * @brief Clear the table by removing all the stored items - * @param table : A valid pointer to an hashtable_t structure - * - * If a free_item_callback has been set, that will be called for each item removed from the table - */ -void ht_clear(hashtable_t *table); - -/** - * @brief Destroy the table by releasing all its resources - * @param table : A valid pointer to an hashtable_t structure - * - * If a free_item_callback has been set, that will be called for each item removed from the table - */ -void ht_destroy(hashtable_t *table); - -/** - * @brief Get the value stored at a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param dlen : If not NULL, the size of the returned data will be stored - * at the address pointed by dlen - * @return The stored value if any, NULL otherwise - */ -void *ht_get(hashtable_t *table, void *key, size_t klen, size_t *dlen); - -/** - * @brief Check if a key exists in the hashtable - * @param 
table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @return 1 If the key exists, 0 if it doesn't exist and -1 in case of error - */ -int ht_exists(hashtable_t *table, void *key, size_t klen); - -/** - * @brief Get a copy of the value stored at a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param dlen : If not NULL, the size of the returned data will be stored - * at the address pointed by dlen - * @return The stored value if any, NULL otherwise - * @note The returned value is a copy (memcpy) of the stored value and the - * caller MUST release it using free() once done - * - * @note The copy is a simple copy done using memcpy() if the stored value - * is structured and requires a deep copy, then ht_get_deep_copy() - * should be used instead of this function - */ -void *ht_get_copy(hashtable_t *table, void *key, size_t klen, size_t *dlen); - -typedef void *(*ht_deep_copy_callback_t)(void *data, size_t dlen, void *user); - -/** - * @brief Get a copy of the value stored at a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param dlen : If not NULL, the size of the returned data will be stored - * at the address pointed by dlen - * @param copy_cb : The callback which will take care of deep-copying the data - * @param user : A private pointer which will be passed back to the copy_cb - * @return The stored value if any, NULL otherwise - * @note The returned value is eventually created by the deep_copy callback - * hence the caller knows if memory will need to be disposed or not and - * how to fully release the structured value which has been deep copied - */ -void *ht_get_deep_copy(hashtable_t *table, void *key, size_t klen, size_t *dlen, ht_deep_copy_callback_t copy_cb, void *user); - -/** - * @brief Set the value for a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the data to store - * @param dlen : The size of the data - * @return 0 on success, -1 otherwise - */ -int ht_set(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen); - -/** - * @brief Set the value for a specific key and returns the previous value if any - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the data to store - * @param dlen : The size of the data - * @param prev_data : If not NULL, the referenced pointer will be set to point to the previous data - * @param prev_len : If not NULL, the size of the previous data will be stored in the memory - * pointed by prev_len - * @return 0 on success, -1 otherwise - * @note If prev_data is not NULL, the previous data will not be released using the free_value callback - * so the caller will be responsible of releasing the previous data once done with it - */ -int ht_get_and_set(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen, void **prev_data, size_t *prev_len); - -/** - * @brief Get the value for a specific key or set a new value if none has been found - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the new 
data to store if none is found - * @param dlen : The size of the data to store - * @param cur_data : If not NULL, the referenced pointer will be set to point to the current data - * @param cur_len : If not NULL, the size of the current data will be stored in the memory - * pointed by cur_len - * @return 0 the value new value has been set successfully;\n - * 1 if a value was already set;\n - * -1 in case of errors - */ -int ht_get_or_set(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen, void **cur_data, size_t *cur_len); - -/** - * @brief Set the value for a specific key and returns the previous value if any. - * - * The new value will be copied before being stored - * - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the data to store - * @param dlen : The size of the data - * @param prev_data : If not NULL, the referenced pointer will be set to point to the previous data - * @param prev_len : If not NULL, the size of the previous data will be stored in the memory - * pointed by prev_len - * @return The previous value if any, NULL otherwise - * @note If prev_data is not NULL, the previous data will not be released using the free_value callback - * so the caller will be responsible of releasing the previous data once done with it - */ -int ht_set_copy(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen, void **prev_data, size_t *prev_len); - - -/** - * @brief Set the value for a specific key if there is no value already stored - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the data to store - * @param dlen : The size of the data - * @return 0 on success;\n - * 1 if a value was already set;\n - * -1 in case of errors - */ -int ht_set_if_not_exists(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen); - -/** - * @brief Set a new value stored at a specific key only if the actual one matches some provided data - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param data : A pointer to the data to store - * @param dlen : The size of the data - * @param match : A valid pointer to the data we need to match in order to delete the value - * @param match_size : The value of the data to match - * @param prev_data : If not NULL the pointer will be set to point to the previous data - * @param prev_len : If not NULL the integer pointer will be set to the size of the previous data - * @node If the prev_data pointer is provided, the caller will be responsible of relasing - * the resources pointed after the call. 
If not provided (NULL) the free_value callback - * will be eventually used (if defined) - * @return 0 on success;\n - * 1 if the value didn't match (a the new value was not set), - * -1 in case of errors - */ -int ht_set_if_equals(hashtable_t *table, void *key, size_t klen, void *data, size_t dlen, void *match, size_t match_size, void **prev_data, size_t *prev_len); - -/** - * @brief Unset the value stored at a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param prev_data : If not NULL, the referenced pointer will be set to point to the previous data - * @param prev_len : If not NULL, the size of the previous data will be stored in the memory - * pointed by prev_len - * @return The previous value if any, NULL otherwise - * @note If prev_data is not NULL, the previous data will not be released using the free_value callback - * so the caller will be responsible of releasing the previous data once done with it - */ -int ht_unset(hashtable_t *table, void *key, size_t klen, void **prev_data, size_t *prev_len); - -/** - * @brief Delete the value stored at a specific key - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param prev_data : If not NULL, the referenced pointer will be set to point to the previous data - * @param prev_len : If not NULL, the size of the previous data will be stored in the memory - * pointed by prev_len - * @return 0 on success, -1 otherwise - * @note If prev_data is not NULL, the previous data will not be released using the free_value callback - * so the caller will be responsible of releasing the previous data once done with it - */ -int ht_delete(hashtable_t *table, void *key, size_t klen, void **prev_data, size_t *prev_len); - -/** - * @brief Delete the value stored at a specific key only if it matches some provided data - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param match : A valid pointer to the data we need to match in order to delete the value - * @param match_size : The value of the data to match - * @return 0 on success, -1 otherwise - */ -int ht_delete_if_equals(hashtable_t *table, void *key, size_t klen, void *match, size_t match_size); - -/** - * @brief Callback called if an item for a given key is found - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @return 0 on success - * 1 on success and the item must be removed from the hashtable - * -1 on error - */ -typedef int (*ht_pair_callback_t)(hashtable_t *table, void *key, size_t klen, void **value, size_t *vlen, void *user); - -/** - * @brief call the provided callback passing the item stored at the specified key (if any) - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key to use - * @param klen : The length of the key - * @param cb : The callback - * @param user : A private pointer which will be passed to the callback when invoked - * @note the callback is called while the bucket-level mutex is being retained - */ -int ht_call(hashtable_t *table, void *key, size_t klen, ht_pair_callback_t cb, void *user); - -/** - * @brief Return the count of items actually stored in the table - * @param table : A valid pointer to an hashtable_t structure - * @return The actual item count - */ -size_t 
ht_count(hashtable_t *table); - -// use the following two functions only if the hashtable_t contains -// a small number of keys, use the iterators otherwise - -typedef struct _hashtable_key_s { - void *data; - size_t len; - size_t vlen; -} hashtable_key_t; - -/** - * @brief Get all stored keys - * @param table : A valid pointer to an hashtable_t structure - * @return A list of hashtable_key_t pointers with all the - * keys present in the table - * @note The returned list should be released calling list_destroy() - */ -linked_list_t *ht_get_all_keys(hashtable_t *table); - -typedef struct _hashtable_value_s { - void *key; - size_t klen; - void *data; - size_t len; -} hashtable_value_t; - -/** - * @brief Get all stored values - * @param table : A valid pointer to an hashtable_t structure - * @return A list containing a pointer to all the values stored in the table - * @note The returned list will contain pointers to the actual stored values - * and not copies - * @note The returned list should be released calling list_destroy() - */ -linked_list_t *ht_get_all_values(hashtable_t *table); - -typedef enum { - HT_ITERATOR_STOP = 0, - HT_ITERATOR_CONTINUE = 1, - HT_ITERATOR_REMOVE = -1, - HT_ITERATOR_REMOVE_AND_STOP = -2 -} ht_iterator_status_t; - -/** - * @brief Callback for the key iterator - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key - * @param klen : The length of the key - * @param user : The user pointer passed as argument to the ht_foreach_pair() function - * @return HT_ITERATOR_CONTINUE to go ahead with the iteration, - * HT_ITERATOR_STOP to stop the iteration, - * HT_ITERATOR_REMOVE to remove the current item from the table and go ahead with the iteration - * HT_ITERATOR_REMOVE_AND_STOP to remove the current item from the table and stop the iteration - */ -typedef ht_iterator_status_t (*ht_key_iterator_callback_t)(hashtable_t *table, void *key, size_t klen, void *user); - -/** - * @brief Key iterator - * @param table : A valid pointer to an hashtable_t structure - * @param cb : an ht_key_iterator_callback_t function - * @param user : A pointer which will be passed to the iterator callback at each call - */ -void ht_foreach_key(hashtable_t *table, ht_key_iterator_callback_t cb, void *user); - -/** - * @brief Callback for the value iterator - * @param table : A valid pointer to an hashtable_t structure - * @param value : The value - * @param vlen : The length of the value - * @param user : The user pointer passed as argument to the ht_foreach_pair() function - * @return HT_ITERATOR_CONTINUE to go ahead with the iteration, - * HT_ITERATOR_STOP to stop the iteration, - * HT_ITERATOR_REMOVE to remove the current item from the table and go ahead with the iteration - * HT_ITERATOR_REMOVE_AND_STOP to remove the current item from the table and stop the iteration - */ -typedef ht_iterator_status_t (*ht_value_iterator_callback_t)(hashtable_t *table, void *value, size_t vlen, void *user); - -/** - * @brief Value iterator - * @param table : A valid pointer to an hashtable_t structure - * @param cb : an ht_value_iterator_callback_t function - * @param user : A pointer which will be passed to the iterator callback at each call - */ -void ht_foreach_value(hashtable_t *table, ht_value_iterator_callback_t cb, void *user); - -/** - * @brief Callback for the pair iterator - * @param table : A valid pointer to an hashtable_t structure - * @param key : The key - * @param klen : The length of the key - * @param value : The value - * @param vlen : The length of the 
value - * @param user : The user pointer passed as argument to the ht_foreach_pair() function - * @return HT_ITERATOR_CONTINUE to go ahead with the iteration, - * HT_ITERATOR_STOP to stop the iteration, - * HT_ITERATOR_REMOVE to remove the current item from the table and go ahead with the iteration - * HT_ITERATOR_REMOVE_AND_STOP to remove the current item from the table and stop the iteration - */ -typedef ht_iterator_status_t (*ht_pair_iterator_callback_t)(hashtable_t *table, void *key, size_t klen, void *value, size_t vlen, void *user); - -/** - * @brief Pair iterator - * @param table : A valid pointer to an hashtable_t structure - * @param cb : an ht_pair_iterator_callback_t function - * @param user : A pointer which will be passed to the iterator callback at each call - */ -void ht_foreach_pair(hashtable_t *table, ht_pair_iterator_callback_t cb, void *user); - -#ifdef __cplusplus -} -#endif - -#endif - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/linklist.c b/sparsity/src/linklist.c deleted file mode 100755 index 21ab76d..0000000 --- a/sparsity/src/linklist.c +++ /dev/null @@ -1,1267 +0,0 @@ -/* linked list management library - by xant - */ - -//#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <errno.h> - -#include "linklist.h" -#include "atomic_defs.h" - -typedef struct _list_entry_s { - struct _linked_list_s *list; - struct _list_entry_s *prev; - struct _list_entry_s *next; - void *value; - int tagged; -} list_entry_t; - -struct _linked_list_s { - list_entry_t *head; - list_entry_t *tail; - list_entry_t *cur; - size_t pos; - size_t length; -#ifdef THREAD_SAFE - pthread_mutex_t lock; -#endif - free_value_callback_t free_value_cb; - int refcnt; - list_entry_t *slices; -}; - -struct _slice_s { - linked_list_t *list; - size_t offset; - size_t length; -}; - -/******************************************************************** - * Entry-based API - * - Internal use only - ********************************************************************/ - -/* Entry creation and destruction routines */ -static inline list_entry_t *create_entry(); -static inline void destroy_entry(list_entry_t *entry); - -/* List and list_entry_t manipulation routines */ -static inline list_entry_t *pop_entry(linked_list_t *list); -static inline int push_entry(linked_list_t *list, list_entry_t *entry); -static inline int unshift_entry(linked_list_t *list, list_entry_t *entry); -static inline list_entry_t *shift_entry(linked_list_t *list); -static inline int insert_entry(linked_list_t *list, list_entry_t *entry, size_t pos); -static inline list_entry_t *pick_entry(linked_list_t *list, size_t pos); -static inline list_entry_t *fetch_entry(linked_list_t *list, size_t pos); -//list_entry_t *SelectEntry(linked_list_t *list, size_t pos); -static inline list_entry_t *remove_entry(linked_list_t *list, size_t pos); -static inline long get_entry_position(list_entry_t *entry); -static inline int move_entry(linked_list_t *list, size_t srcPos, size_t dstPos); -static inline list_entry_t *subst_entry(linked_list_t *list, size_t pos, list_entry_t *entry); -static inline int swap_entries(linked_list_t *list, size_t pos1, size_t pos2); - -/* - * Create a new linked_list_t. 
Allocates resources and returns - * a linked_list_t opaque structure for later use - */ -linked_list_t * -list_create() -{ - linked_list_t *list = (linked_list_t *)calloc(1, sizeof(linked_list_t)); - if(list) { - if (list_init(list) != 0) { - free(list); - return NULL; - } - } - return list; -} - -/* - * Initialize a preallocated linked_list_t pointed by list - * useful when using static list handlers - */ -int -list_init(linked_list_t *list __attribute__ ((unused))) -{ -#ifdef THREAD_SAFE - pthread_mutexattr_t attr; - if (pthread_mutexattr_init(&attr) != 0) { - return -1; - } - pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); - if (pthread_mutex_init(&list->lock, &attr) != 0) { - return -1; - } - pthread_mutexattr_destroy(&attr); -#endif - return 0; -} - -/* - * Destroy a linked_list_t. Free resources allocated for list - */ -void -list_destroy(linked_list_t *list) -{ - if(list) - { - while (list->slices) - slice_destroy(list->slices->value); - list_clear(list); -#ifdef THREAD_SAFE - MUTEX_DESTROY(list->lock); -#endif - free(list); - } -} - -static void -list_destroy_tagged_value_internal(tagged_value_t *tval, void (*free_cb)(void *v)) -{ - if(tval) - { - free(tval->tag); - if(tval->value) { - if(tval->type == TV_TYPE_LIST) - list_destroy((linked_list_t *)tval->value); - else if (free_cb) - free_cb(tval->value); - else if (tval->vlen) - free(tval->value); - } - free(tval); - } -} - -/* - * Clear a linked_list_t. Removes all entries in list - * if values are associated to entries, resources for those will not be freed. - * list_clear() can be used safely with entry-based and tagged-based api, - * otherwise you must really know what you are doing - */ -void -list_clear(linked_list_t *list) -{ - list_entry_t *e; - /* Destroy all entries still in list */ - while((e = shift_entry(list)) != NULL) - { - /* if there is a tagged_value_t associated to the entry, - * let's free memory also for it */ - if(e->tagged && e->value) - list_destroy_tagged_value_internal((tagged_value_t *)e->value, list->free_value_cb); - else if (list->free_value_cb) - list->free_value_cb(e->value); - - destroy_entry(e); - } -} - -/* Returns actual lenght of linked_list_t pointed by l */ -size_t -list_count(linked_list_t *l) -{ - size_t len; - MUTEX_LOCK(l->lock); - len = l->length; - MUTEX_UNLOCK(l->lock); - return len; -} - -void -list_set_free_value_callback(linked_list_t *list, free_value_callback_t free_value_cb) -{ - MUTEX_LOCK(list->lock); - list->free_value_cb = free_value_cb; - MUTEX_UNLOCK(list->lock); -} - -void -list_lock(linked_list_t *list __attribute__ ((unused))) -{ - MUTEX_LOCK(list->lock); -} - -void -list_unlock(linked_list_t *list __attribute__ ((unused))) -{ - MUTEX_UNLOCK(list->lock); -} - -/* - * Create a new list_entry_t structure. Allocates resources and returns - * a pointer to the just created list_entry_t opaque structure - */ -static inline -list_entry_t *create_entry() -{ - list_entry_t *new_entry = (list_entry_t *)calloc(1, sizeof(list_entry_t)); - /* - if (!new_entry) { - fprintf(stderr, "Can't create new entry: %s", strerror(errno)); - } - */ - return new_entry; -} - -/* - * Free resources allocated for a list_entry_t structure - * If the entry is linked in a list this routine will also unlink correctly - * the entry from the list. 
- */ -static inline void -destroy_entry(list_entry_t *entry) -{ - long pos; - if(entry) - { - if(entry->list) - { - /* entry is linked in a list...let's remove that reference */ - pos = get_entry_position(entry); - if(pos >= 0) - remove_entry(entry->list, pos); - } - free(entry); - } -} - -/* - * Pops a list_entry_t from the end of the list (or bottom of the stack - * if you are using the list as a stack) - */ -static inline -list_entry_t *pop_entry(linked_list_t *list) -{ - list_entry_t *entry; - MUTEX_LOCK(list->lock); - - entry = list->tail; - if(entry) - { - list->tail = entry->prev; - if(list->tail) - list->tail->next = NULL; - list->length--; - - entry->list = NULL; - entry->prev = NULL; - entry->next = NULL; - - if (list->cur == entry) - list->cur = NULL; - } - if(list->length == 0) - list->head = list->tail = NULL; - - MUTEX_UNLOCK(list->lock); - return entry; -} - -/* - * Pushs a list_entry_t at the end of a list - */ -static inline int -push_entry(linked_list_t *list, list_entry_t *entry) -{ - list_entry_t *p; - if(!entry) - return -1; - MUTEX_LOCK(list->lock); - if(list->length == 0) - { - list->head = list->tail = entry; - } - else - { - p = list->tail; - p->next = entry; - entry->prev = p; - entry->next = NULL; - list->tail = entry; - } - list->length++; - entry->list = list; - MUTEX_UNLOCK(list->lock); - return 0; -} - -/* - * Retreive a list_entry_t from the beginning of a list (or top of the stack - * if you are using the list as a stack) - */ -static inline -list_entry_t *shift_entry(linked_list_t *list) -{ - list_entry_t *entry; - MUTEX_LOCK(list->lock); - entry = list->head; - if(entry) - { - list->head = entry->next; - if(list->head) - list->head->prev = NULL; - list->length--; - - entry->list = NULL; - entry->prev = NULL; - entry->next = NULL; - - if (list->cur == entry) - list->cur = NULL; - else if (list->pos) - list->pos--; - } - if(list->length == 0) - list->head = list->tail = NULL; - MUTEX_UNLOCK(list->lock); - return entry; -} - - -/* - * Insert a list_entry_t at the beginning of a list (or at the top if the stack) - */ -static inline int -unshift_entry(linked_list_t *list, list_entry_t *entry) -{ - list_entry_t *p; - if(!entry) - return -1; - MUTEX_LOCK(list->lock); - if(list->length == 0) - { - list->head = list->tail = entry; - } - else - { - p = list->head; - p->prev = entry; - entry->prev = NULL; - entry->next = p; - list->head = entry; - } - list->length++; - entry->list = list; - if (list->cur) - list->pos++; - MUTEX_UNLOCK(list->lock); - return 0; -} - -/* - * Instert an entry at a specified position in a linked_list_t - */ -static inline int -insert_entry(linked_list_t *list, list_entry_t *entry, size_t pos) -{ - list_entry_t *prev, *next; - int ret = -1; - MUTEX_LOCK(list->lock); - if(pos == 0) { - ret = unshift_entry(list, entry); - } else if(pos == list->length) { - ret = push_entry(list, entry); - } else if (pos > list->length) { - unsigned int i; - for (i = list->length; i < pos; i++) { - list_entry_t *emptyEntry = create_entry(); - if (!emptyEntry || push_entry(list, emptyEntry) != 0) - { - if (emptyEntry) - destroy_entry(emptyEntry); - MUTEX_UNLOCK(list->lock); - return -1; - } - } - ret = push_entry(list, entry); - } - - if (ret == 0) { - MUTEX_UNLOCK(list->lock); - return ret; - } - - prev = pick_entry(list, pos-1); - if(prev) - { - next = prev->next; - prev->next = entry; - entry->prev = prev; - entry->next = next; - if (next) - next->prev = entry; - list->length++; - ret = 0; - } - MUTEX_UNLOCK(list->lock); - return ret; -} - -/* - * Retreive 
the list_entry_t at pos in a linked_list_t without removing it from the list - */ -static inline -list_entry_t *pick_entry(linked_list_t *list, size_t pos) -{ - unsigned int i; - list_entry_t *entry; - - MUTEX_LOCK(list->lock); - - if(list->length <= pos) { - MUTEX_UNLOCK(list->lock); - return NULL; - } - - size_t half_length = list->length >> 1; - /* we rely on integer underflow for the argument to abs(). */ - if (list->cur && (size_t)abs((int)(list->pos - pos)) < half_length) { - entry = list->cur; - if (list->pos != pos) { - if (list->pos < pos) { - for(i=list->pos; i < pos; i++) { - entry = entry->next; - } - } else if (list->pos > pos) { - for(i=list->pos; i > pos; i--) { - entry = entry->prev; - } - } - } - } else { - if (pos > half_length) - { - entry = list->tail; - for(i=list->length - 1;i>pos;i--) { - entry = entry->prev; - } - } - else - { - entry = list->head; - for(i=0;i<pos;i++) { - entry = entry->next; - } - } - } - if (entry) { - list->pos = pos; - list->cur = entry; - } - - MUTEX_UNLOCK(list->lock); - return entry; -} - -/* Retreive the list_entry_t at pos in a linked_list_t removing it from the list - * XXX - no locking here because this routine is just an accessor to other routines - * Caller MUST destroy the returned entry trough destroy_entry() call - */ -static inline -list_entry_t *fetch_entry(linked_list_t *list, size_t pos) -{ - list_entry_t *entry = NULL; - if(pos == 0 ) - return shift_entry(list); - else if(pos == list_count(list) - 1) - return pop_entry(list); - - entry = remove_entry(list, pos); - return entry; -} - -static inline int -move_entry(linked_list_t *list, size_t srcPos, size_t dstPos) -{ - list_entry_t *e; - - e = fetch_entry(list, srcPos); - if(e) - { - if(insert_entry(list, e, dstPos) == 0) - return 0; - else - { - if(insert_entry(list, e, srcPos) != 0) - { - //fprintf(stderr, "Can't restore entry at index %lu while moving to %lu\n", srcPos, dstPos); - } - } - } - /* TODO - Unimplemented */ - return -1; -} - -/* XXX - still dangerous ... 
*/ -static inline int -swap_entries(linked_list_t *list, size_t pos1, size_t pos2) -{ - list_entry_t *e1; - list_entry_t *e2; - if(pos2 > pos1) - { - e2 = fetch_entry(list, pos2); - insert_entry(list, e2, pos1); - e1 = fetch_entry(list, pos1+1); - insert_entry(list, e1, pos2); - } - else if(pos1 > pos2) - { - e1 = fetch_entry(list, pos1); - insert_entry(list, e1, pos2); - e2 = fetch_entry(list, pos2+1); - insert_entry(list, e2, pos1); - } - else - return -1; - - /* TODO - Unimplemented */ - return 0; -} - -/* return old entry at pos */ -static inline -list_entry_t *subst_entry(linked_list_t *list, size_t pos, list_entry_t *entry) -{ - list_entry_t *old; - - MUTEX_LOCK(list->lock); - - old = fetch_entry(list, pos); - if(!old) { - MUTEX_UNLOCK(list->lock); - return NULL; - } - insert_entry(list, entry, pos); - - MUTEX_UNLOCK(list->lock); - /* XXX - NO CHECK ON INSERTION */ - return old; -} - -/* XXX - POSSIBLE RACE CONDITION BETWEEN pick_entry and the actual removal */ -static inline -list_entry_t *remove_entry(linked_list_t *list, size_t pos) -{ - list_entry_t *next, *prev; - list_entry_t *entry = pick_entry(list, pos); - MUTEX_LOCK(list->lock); - if(entry) - { - prev = entry->prev; - next = entry->next; - if (pos == 0) - list->head = next; - else if (pos == list->length - 1) - list->tail = prev; - - if(prev) - prev->next = next; - if(next) - next->prev = prev; - - list->length--; - entry->list = NULL; - entry->prev = NULL; - entry->next = NULL; - - if (list->cur == entry) { - list->cur = NULL; - list->pos = 0; - } else if (list->pos > pos) { - list->pos--; - } - MUTEX_UNLOCK(list->lock); - return entry; - } - MUTEX_UNLOCK(list->lock); - return NULL; -} - -/* return position of entry if linked in a list. - * Scans entire list so it can be slow for very long lists */ -long -get_entry_position(list_entry_t *entry) -{ - int i = 0; - linked_list_t *list; - list_entry_t *p; - list = entry->list; - - if (!list) - return -1; - - MUTEX_LOCK(list->lock); - if(list) - { - p = list->head; - while(p) - { - if(p == entry) { - MUTEX_UNLOCK(list->lock); - return i; - } - p = p->next; - i++; - } - } - MUTEX_UNLOCK(list->lock); - return -1; -} - -void * -list_pop_value(linked_list_t *list) -{ - void *val = NULL; - list_entry_t *entry = pop_entry(list); - if(entry) - { - val = entry->value; - destroy_entry(entry); - } - return val; -} - -int -list_push_value(linked_list_t *list, void *val) -{ - int res; - list_entry_t *new_entry = create_entry(); - if(!new_entry) - return -1; - new_entry->value = val; - res = push_entry(list, new_entry); - if(res != 0) - destroy_entry(new_entry); - return res; -} - -int -list_unshift_value(linked_list_t *list, void *val) -{ - int res; - list_entry_t *new_entry = create_entry(); - if(!new_entry) - return -1; - new_entry->value = val; - res = unshift_entry(list, new_entry); - if(res != 0) - destroy_entry(new_entry); - return res; -} - -void * -list_shift_value(linked_list_t *list) -{ - void *val = NULL; - list_entry_t *entry = shift_entry(list); - if(entry) - { - val = entry->value; - destroy_entry(entry); - } - return val; -} - -int -list_insert_value(linked_list_t *list, void *val, size_t pos) -{ - int res; - list_entry_t *new_entry = create_entry(); - if(!new_entry) - return -1; - new_entry->value = val; - res=insert_entry(list, new_entry, pos); - if(res != 0) - destroy_entry(new_entry); - return res; -} - -void * -list_pick_value(linked_list_t *list, size_t pos) -{ - list_entry_t *entry = pick_entry(list, pos); - if(entry) - return entry->value; - return NULL; -} - -void * 
-list_fetch_value(linked_list_t *list, size_t pos) -{ - void *val = NULL; - list_entry_t *entry = fetch_entry(list, pos); - if(entry) - { - val = entry->value; - destroy_entry(entry); - } - return val; -} - -/* just an accessor to move_entry */ -int -list_move_value(linked_list_t *list, size_t srcPos, size_t dstPos) -{ - return move_entry(list, srcPos, dstPos); -} - -void * -list_set_value(linked_list_t *list, size_t pos, void *newval) -{ - void *old_value = NULL; - MUTEX_LOCK(list->lock); - list_entry_t *entry = pick_entry(list, pos); - if (entry) { - old_value = entry->value; - entry->value = newval; - } else { - list_insert_value(list, newval, pos); - } - MUTEX_UNLOCK(list->lock); - return old_value; -} - -/* return old value at pos */ -void * -list_subst_value(linked_list_t *list, size_t pos, void *newval) -{ - void *old_value = NULL; - MUTEX_LOCK(list->lock); - list_entry_t *entry = pick_entry(list, pos); - if (entry) { - old_value = entry->value; - entry->value = newval; - } - MUTEX_UNLOCK(list->lock); - return old_value; -} - -int -list_swap_values(linked_list_t *list, size_t pos1, size_t pos2) -{ - return swap_entries(list, pos1, pos2); -} - -int -list_foreach_value(linked_list_t *list, int (*item_handler)(void *item, size_t idx, void *user), void *user) -{ - MUTEX_LOCK(list->lock); - slice_t slice = { - .list = list, - .offset = 0, - .length = list->length - }; - MUTEX_UNLOCK(list->lock); - return slice_foreach_value(&slice, item_handler, user); -} - -tagged_value_t * -list_create_tagged_value_nocopy(char *tag, void *val) -{ - tagged_value_t *newval = (tagged_value_t *)calloc(1, sizeof(tagged_value_t)); - if(!newval) { - //fprintf(stderr, "Can't create new tagged value: %s", strerror(errno)); - return NULL; - } - - if(tag) - newval->tag = strdup(tag); - if (val) - newval->value = val; - - return newval; -} - -/* - * Allocates resources for a new tagged_value_t initializing both tag and value - * to what received as argument. - * if vlen is 0 or negative, then val is assumed to be a string and - * strdup is used to copy it. - * Return a pointer to the new allocated tagged_value_t. - */ -tagged_value_t * -list_create_tagged_value(char *tag, void *val, size_t vlen) -{ - tagged_value_t *newval = (tagged_value_t *)calloc(1, sizeof(tagged_value_t)); - if(!newval) { - //fprintf(stderr, "Can't create new tagged value: %s", strerror(errno)); - return NULL; - } - - if(tag) - newval->tag = strdup(tag); - if(val) - { - if(vlen) - { - newval->value = malloc(vlen+1); - if(newval->value) - { - memcpy(newval->value, val, vlen); - memset((char *)newval->value+vlen, 0, 1); - newval->vlen = vlen; - } else { - //fprintf(stderr, "Can't copy value: %s", strerror(errno)); - free(newval->tag); - free(newval); - return NULL; - } - newval->type = TV_TYPE_BINARY; - } - else - { - newval->value = (void *)strdup((char *)val); - newval->vlen = strlen((char *)val); - newval->type = TV_TYPE_STRING; - } - } - return newval; -} - -/* - * Allocates resources for a new tagged_value_t - * containing a linked_list_t instead of a simple buffer. 
- * This let us define folded linked_list_t and therefore represent - * trees (or a sort of folded hashrefs) - */ -tagged_value_t * -list_create_tagged_sublist(char *tag, linked_list_t *sublist) -{ - tagged_value_t *newval = (tagged_value_t *)calloc(1, sizeof(tagged_value_t)); - if(!newval) { - //fprintf(stderr, "Can't create new tagged value: %s", strerror(errno)); - return NULL; - } - - if(tag) - newval->tag = strdup(tag); - newval->type = TV_TYPE_LIST; - newval->value = sublist; - return newval; -} - -/* Release resources for tagged_value_t pointed by tval */ -void -list_destroy_tagged_value(tagged_value_t *tval) -{ - list_destroy_tagged_value_internal(tval, NULL); -} - -tagged_value_t * -list_set_tagged_value(linked_list_t *list, char *tag, void *value, size_t len, int copy) -{ - int i; - - tagged_value_t *tval; - if (copy) - tval = list_create_tagged_value(tag, value, len); - else - tval = list_create_tagged_value_nocopy(tag, value); - - MUTEX_LOCK(list->lock); - for (i = 0; i < (int)list->length; i++) { - tagged_value_t *tv = list_pick_tagged_value(list, i); - if (tv && tv->tag && tv->tag[0] == tag[0] && - strcmp(tv->tag, tag) == 0) - { - MUTEX_UNLOCK(list->lock); - if (!list_set_value(list, i, tval)) { - list_destroy_tagged_value(tval); - return NULL; - } - return tv; - } - } - if (list_push_tagged_value(list, tval) == 0) { - list_destroy_tagged_value(tval); - tval = NULL; - } - MUTEX_UNLOCK(list->lock); - return NULL; -} - -/* Pops a tagged_value_t from the list pointed by list */ -tagged_value_t * -list_pop_tagged_value(linked_list_t *list) -{ - return (tagged_value_t *)list_pop_value(list); -} - -/* - * Pushes a new tagged_value_t into list. user must give a valid tagged_value_t pointer - * created trough a call to create_tagged_value() routine - */ -int -list_push_tagged_value(linked_list_t *list, tagged_value_t *tval) -{ - list_entry_t *new_entry; - int res = 0; - if(tval) - { - new_entry = create_entry(); - if(new_entry) - { - new_entry->tagged = 1; - new_entry->value = tval; - res = push_entry(list, new_entry); - if(res != 0) - destroy_entry(new_entry); - } - } - return res; -} - -int -list_unshift_tagged_value(linked_list_t *list, tagged_value_t *tval) -{ - int res = 0; - list_entry_t *new_entry; - if(tval) - { - new_entry = create_entry(); - if(new_entry) - { - new_entry->tagged = 1; - new_entry->value = tval; - res = unshift_entry(list, new_entry); - if(res != 0) - destroy_entry(new_entry); - } - } - return res; -} - -tagged_value_t * -shift_tagged_value(linked_list_t *list) -{ - return (tagged_value_t *)list_shift_value(list); -} - -int -list_insert_tagged_value(linked_list_t *list, tagged_value_t *tval, size_t pos) -{ - int res = 0; - list_entry_t *new_entry; - if(tval) - { - new_entry = create_entry(); - if(new_entry) - { - new_entry->tagged = 1; - new_entry->value = tval; - res = insert_entry(list, new_entry, pos); - if(res != 0) - destroy_entry(new_entry); - } - } - return res; -} - -tagged_value_t * -list_pick_tagged_value(linked_list_t *list, size_t pos) -{ - return (tagged_value_t *)list_pick_value(list, pos); -} - -tagged_value_t * -list_fetch_tagged_value(linked_list_t *list, size_t pos) -{ - return (tagged_value_t *)list_fetch_value(list, pos); -} - -/* - * ... 
without removing it from the list - */ -tagged_value_t * -list_get_tagged_value(linked_list_t *list, char *tag) -{ - int i; - tagged_value_t *tval; - for(i = 0;i < (int)list_count(list); i++) - { - tval = list_pick_tagged_value(list, i); - if (!tval) { - continue; - } - if(strcmp(tval->tag, tag) == 0) - return tval; - } - return NULL; -} - -/* - * ... without removing it from the list - * USER MUST NOT FREE MEMORY FOR RETURNED VALUES - * User MUST create a new list, pass it as 'values' - * and destroy it when no more needed .... entries - * returned inside the 'values' list MUST not be freed, - * because they reference directly the real entries inside 'list'. - */ -size_t -list_get_tagged_values(linked_list_t *list, char *tag, linked_list_t *values) -{ - int i; - int ret; - tagged_value_t *tval; - ret = 0; - for(i = 0;i < (int)list_count(list); i++) - { - tval = list_pick_tagged_value(list, i); - if (!tval) { - continue; - } - if(strcmp(tval->tag, tag) == 0) - { - list_push_value(values, tval->value); - ret++; - } - } - return ret; -} - -static inline void -swap_entry_node_val(list_entry_t *p1, list_entry_t *p2) -{ - if (!p1 || !p2) return; - - void *tmp = p1->value; - p1->value = p2->value; - p2->value = tmp; -} - -static inline void -list_quick_sort(list_entry_t *head, - list_entry_t *tail, - list_entry_t *pivot, - int length, - list_comparator_callback_t comparator) -{ - if (!head || !tail || !pivot || length < 2 || !comparator) return; - - if (length == 2) { - if (comparator(head->value, tail->value) < 0) - swap_entry_node_val(head, tail); - return; - } - - void *pvalue = pivot->value; - list_entry_t *p1 = head, *p2 = tail; - - for (;;) { - - while(p1 && p1 != pivot && comparator(p1->value, pvalue) > 0) - p1 = p1->next; - - while(p2 && p2 != pivot && comparator(p2->value, pvalue) < 0) - p2 = p2->prev; - - if (p1 == p2 || !p1 || !p2) - break; - - if (p1 == pivot) { - // all the elements on the left of the pivot are smaller - // so we can't just swap values anymore - if (p2->prev) - p2->prev->next = p2->next; - if (p2->next) - p2->next->prev = p2->prev; - - if (pivot->prev) - pivot->prev->next = p2; - else if (pivot == pivot->list->head) - pivot->list->head = p2; - - if (p2 == pivot->list->tail) - pivot->list->tail = p2->prev; - - list_entry_t *tmp = p2->prev; - p2->prev = pivot->prev; - pivot->prev = p2; - if (p2->prev) - p2->prev->next = p2; - - p2->next = pivot; - if (p2->next == head) - head = p2; - if (p2 == tail) - tail = tmp; - p2 = tmp; - - if (p1 != pivot) - p1 = p1->next; - - - } else if (p2 == pivot) { - // all the elements on the right of the pivot are bigger - // so we can't just swap values anymore - if (p1->prev) - p1->prev->next = p1->next; - if (p1->next) - p1->next->prev = p1->prev; - - if (pivot->next) - pivot->next->prev = p1; - else if (pivot == pivot->list->tail) - pivot->list->tail = p1; - - if (p1 == pivot->list->head) - pivot->list->head = p1->next; - - list_entry_t *tmp = p1->next; - p1->next = pivot->next; - pivot->next = p1; - if (p1->next) - p1->next->prev = p1; - - p1->prev = pivot; - if (p1->prev == tail) - tail = p1; - if (p1 == head) - head = tmp; - p1 = tmp; - - if (p2 != pivot) - p2 = p2->prev; - - } else { - swap_entry_node_val(p1, p2); - - if (p1 != pivot) - p1 = p1->next; - if (p2 != pivot) - p2 = p2->prev; - } - - } - - // TODO - optimize the pivot selection on the sublists - // (it could be done while traversing the list - // earlier in this function) - int l1 = 0; - p1 = head; - while (p1 != pivot) { - p1 = p1->next; - l1++; - } - int l2 = length 
- (l1 + 1); - int i; - list_entry_t *pv1 = head, *pv2 = tail; - for (i = 0; pv1 && pv1->next && i < l1/2; ++i) - pv1 = pv1->next; - for (i = 0; pv2 && pv2->prev && i < l2/2; ++i) - pv2 = pv2->prev; - - // recursion here - if (l1 > 1 && pivot->prev && head != pivot->prev) - list_quick_sort(head, pivot->prev, pv1, l1, comparator); - if (l2 > 1 && pivot->next && tail != pivot->next) - list_quick_sort(pivot->next, tail, pv2, l2, comparator); -} - -void -list_sort(linked_list_t *list, list_comparator_callback_t comparator) -{ - MUTEX_LOCK(list->lock); - list_entry_t *pivot = pick_entry(list, (list->length/2) - 1); - list_quick_sort(list->head, list->tail, pivot, list->length, comparator); - list->cur = NULL; - list->pos = 0; - MUTEX_UNLOCK(list->lock); -} - -slice_t * -slice_create(linked_list_t *list, size_t offset, size_t length) -{ - slice_t *slice = calloc(1, sizeof(slice_t)); - slice->list = list; - slice->offset = offset; - slice->length = length; - list_entry_t *e = create_entry(); - e->value = slice; - list_entry_t *cur = list->slices; - if (!cur) { - list->slices = e; - } else { - while (cur->next) - cur = cur->next; - cur->next = e; - e->prev = cur; - } - - return slice; -} - -void -slice_destroy(slice_t *slice) -{ - linked_list_t *list = slice->list; - list_entry_t *cur = list->slices; - list_entry_t *prev = NULL; - while (cur) { - if (cur->value == slice) { - if (prev) { - prev->next = cur->next; - cur->next->prev = prev; - } else { - list->slices = cur->next; - } - destroy_entry(cur); - break; - } - prev = cur; - cur = cur->next; - } - free(slice); -} - -int -slice_foreach_value(slice_t *slice, int (*item_handler)(void *item, size_t idx, void *user), void *user) -{ - linked_list_t *list = slice->list; - MUTEX_LOCK(list->lock); - size_t idx = 0; - list_entry_t *e = pick_entry(list, slice->offset); - while(e && idx < slice->length) { - int rc = item_handler(e->value, idx++, user); - if (rc == 0) { - break; - } else if (rc == -1 || rc == -2) { - list_entry_t *d = e; - e = e->next; - if (list->head == list->tail && list->tail == d) { - list->head = list->tail = NULL; - } else if (d == list->head) { - list->head = d->next; - list->head->prev = NULL; - } else if (d == list->tail) { - list->tail = d->prev; - list->tail->next = NULL; - } else { - e->prev = d->prev; - e->prev->next = e; - } - d->list = NULL; - if (list->cur == d) - list->cur = NULL; - list->length--; - slice->length--; - // the callback got the value and will take care of releasing it - destroy_entry(d); - if (rc == -2) // -2 means : remove and stop the iteration - break; - // -1 instead means that we still want to remove the item - // but we also want to go ahead with the iteration - } else { - e = e->next; - } - } - MUTEX_UNLOCK(list->lock); - return idx; -} - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/linklist.h b/sparsity/src/linklist.h deleted file mode 100755 index 1f8d7de..0000000 --- a/sparsity/src/linklist.h +++ /dev/null @@ -1,423 +0,0 @@ -/** - * @file linklist.h - * @author Andrea Guzzo - * @date 22/09/2013 - * @brief Fast thread-safe linklist implementation - * @note In case of failures reported from the pthread interface - * abort() will be called. Callers can catch SIGABRT if more - * actions need to be taken. 
- */ -#ifndef HL_LINKLIST_H -#define HL_LINKLIST_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <stdlib.h> -#include <stdint.h> -#include <sys/types.h> -#ifdef WIN32 -#ifdef THREAD_SAFE -#include <w32_pthread.h> -#endif -#endif -#include <string.h> // for memset - -/** - * @brief Callback that, if provided, will be called to release the value resources - * when an item is being removed from the list - */ -typedef void (*free_value_callback_t)(void *v); - -typedef int (*list_comparator_callback_t)(void *v1, void *v2); - -/** - * @brief Opaque structure representing the actual linked list descriptor - */ -typedef struct _linked_list_s linked_list_t; - - -/******************************************************************** - * Common API - ********************************************************************/ - -/* List creation and destruction routines */ - -/** - * @brief Create a new list - * @return a newly allocated and initialized list - */ -linked_list_t *list_create(); - -/** - * @brief Initialize a pre-allocated list - * - * This function can be used to initialize a statically defined list - * @return 0 on success; -1 otherwise - */ -int list_init(linked_list_t *list); - -/** - * @brief Release all resources related to the list - * @param list : A valid pointer to a linked_list_t structure - */ -void list_destroy(linked_list_t *list); - -/** - * @brief remove all items from the list - * @param list : A valid pointer to a linked_list_t structure - */ -void list_clear(linked_list_t *list); - -/** - * @brief Return the total count of items in the list - * @param list : A valid pointer to a linked_list_t structure - * @return the actual number of items stored in the list - */ -size_t list_count(linked_list_t *list); - -/** - * @brief Set the callback which must be called to release values stored in the list - * @param list : A valid pointer to a linked_list_t structure - * @param free_value_cb : an free_value_callback_t function - */ -void list_set_free_value_callback(linked_list_t *list, free_value_callback_t free_value_cb); - -/** - * @brief Lock the list - * @param list : A valid pointer to a linked_list_t structure - */ -void list_lock(linked_list_t *list); - -/** - * @brief Unlock the list - * @param list : A valid pointer to a linked_list_t structure - */ -void list_unlock(linked_list_t *list); - -/******************************************************************** - * Value-based API - ********************************************************************/ - - -/* List access routines */ - -/** - * @brief Remove last value from the list - * @param list : A valid pointer to a linked_list_t structure - * @return The value previous tail of the list - */ -void *list_pop_value(linked_list_t *list); - -/** - * @brief Append a new value to the list (tail) - * @param list : A valid pointer to a linked_list_t structure - * @param val : The value to store in the tail of the list - * @return : 0 if success, -1 otherwise - */ -int list_push_value(linked_list_t *list, void *val); - -/** - * @brief Insert a new value at the beginning of the least (head) - * @param list : A valid pointer to a linked_list_t structure - * @param val : The value to store in the head of the list - * @return : 0 if success, -1 otherwise - */ -int list_unshift_value(linked_list_t *list, void *val); - -/** - * @brief Remove the first value from the list - * @param list : A valid pointer to a linked_list_t structure - * @return The previous value stored in the tail of the list - */ - -void 
*list_shift_value(linked_list_t *list); - -/** - * @brief Insert a value at a specific position - * @param list : A valid pointer to a linked_list_t structure - * @param val : The value to store at pos - * @param pos : The position (offset) where to store the value - * @return 0 if success, -1 otherwise - * - * If the list is shorter than pos-1 empty values will be inserted up to - * that position before inserting the new one - */ -int list_insert_value(linked_list_t *list, void *val, size_t pos); - - -/** - * @brief Set the value at a specific position - * @param list : A valid pointer to a linked_list_t structure - * @param pos : The position (offset) where to store the value - * @param val : The value to store at pos - * - * This function will replace the value at pos if present or insert it if missing - * filling in the gaps with NULL values if the length of the list is shorter than pos - */ -void *list_set_value(linked_list_t *list, size_t pos, void *val); - -/** - * @brief Replace the value stored at a specific position with a new value - * @param list : A valid pointer to a linked_list_t structure - * @param pos : The position of the value we want to replace - * @param val : The new value - */ -void *list_subst_value(linked_list_t *list, size_t pos, void *val); - - -/** - * @brief Pick the value at a specific position - * @param list : A valid pointer to a linked_list_t structure - * @param pos : The position (offset) of the requested value - * @return : The value stored at pos if any, NULL otherwise - * - * Note this is a read-only access and the value will not be removed from the list - */ -void *list_pick_value(linked_list_t *list, size_t pos); - -/** - * @brief Fetch (aka: Pick and Remove) the value at a specific position - * @param list : A valid pointer to a linked_list_t structure - * @param pos : The position (offset) of the requested value - * @return : The value stored at pos if any, NULL otherwise - * - * Note this is a read-write access and the value will be removed from the list before returning it. - * The value will not be released so the free_value_callback won't be called in this case - */ -void *list_fetch_value(linked_list_t *list, size_t pos); - -/** - * @brief Move an existing value to a new position - * @param list : A valid pointer to a linked_list_t structure - * @param srcPos : The actual position of the value we want to move - * @param dstPos : The new position where to move the value to - * @return : 0 if success, -1 otherwise - */ -int list_move_value(linked_list_t *list, size_t srcPos, size_t dstPos); - -/** - * @brief Swap two values - * @param list : A valid pointer to a linked_list_t structure - * @param pos1 : The position of the first value to swap with a second one - * @param pos2 : The position of the second value to swap with the first - * @return 0 if success, -1 otherwise - */ -int list_swap_values(linked_list_t *list, size_t pos1, size_t pos2); - - -/** - * @brief Callback for the value iterator - * @return 1 to go ahead with the iteration, - * 0 to stop the iteration, - * -1 to remove the current item from the list and go ahead with the iteration - * -2 to remove the current item from the list and stop the iteration - */ -typedef int (*item_handler_t)(void *item, size_t idx, void *user); - -/* list iterator. This iterator can be used for both Tag-based and Value-based lists. - * If tagged, items can simply be casted to a tagged_value_t pointer. 
- * @return The number of items visited during the iteration - */ -int list_foreach_value(linked_list_t *list, item_handler_t item_handler, void *user); - -/******************************************************************** - * Tag-based API - ********************************************************************/ - -/** - * @brief Tagged Value - * - * This structure represent a tagged_value_t and is the main datatype - * you will have to handle when workin with the tagged-based api. - * If user extract such structure from the list (removing it from the list) - * then he MUST release its resources trough a call to destroy_tagged_value - * when finished using it. - * If a new tagged_value must be created and inserted in a list, then - * list_create_tagged_value() should be used to allocate resources and obtain - * a pointer to a tagged_value_t structure. - */ -typedef struct _tagged_value_s { - char *tag; - void *value; - size_t vlen; - char type; -#define TV_TYPE_STRING 0 -#define TV_TYPE_BINARY 1 -#define TV_TYPE_LIST 2 -} tagged_value_t; - - -/* List creation and destruction routines */ - -/* Tagged List access routines (same of previous but with tag support */ -/** - * @brief Allocate resources for a new tagged value - * @param tag : The tag - * @param val : The value - * @param len : The size of the value - * @return a newly created tagged value with the provided tag and value - * - * Both the tag and the value will be copied. len will be the size used by the copy - */ -tagged_value_t *list_create_tagged_value(char *tag, void *val, size_t len); - -/** - * @brief Allocate resources for a new tagged value without copying the value - * @param tag : The tag - * @param val : The value - * @return A newly created tagged value with the provided tag and value - * - * Only the tag will be copied, the value will just point - * to the provided value without it being copied - */ -tagged_value_t *list_create_tagged_value_nocopy(char *tag, void *val); - -/** - * @brief Create a tagged value where the value is a linked_list_t - * @param tag : The tag - * @param list : The list used as value - * @return A newly created tagged value with type TV_TYPE_LIST - * - * This function is just an accessor to set the tagged_value->type properly - * when using it to store a list - */ -tagged_value_t *list_create_tagged_sublist(char *tag, linked_list_t *list); - -/** - * @brief Release resources used by the tagged value tval - * @param tval : The tagged value to release - */ -void list_destroy_tagged_value(tagged_value_t *tval); - -/** - * @brief Same as pop_value but expect the value to be a pointer to a tagged_value_t structure - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @return The tagged value stored at the end of the list - */ -tagged_value_t *list_pop_tagged_value(linked_list_t *list); - -/** - * @brief Same as push_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param tval: The new tagged value to store - * @return 0 if success, -1 otherwise - */ -int list_push_tagged_value(linked_list_t *list, tagged_value_t *tval); - -/** - * @brief Same as unshift_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param tval: The new tagged value to store - * @return 0 if success, -1 otherwise - */ -int list_unshift_tagged_value(linked_list_t *list, tagged_value_t *tval); - -/** - * @brief 
Same as shift_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @return The tagged value stored in the head of the list, NULL if the list is empty - */ -tagged_value_t *list_shift_tagged_value(linked_list_t *list); - -/** - * @brief Same as insert_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param tval: The new tagged value to store - * @param pos: The position (index) where to store the new tagged value - * @return 0 if success, -1 otherwise - */ -int list_insert_tagged_value(linked_list_t *list, tagged_value_t *tval, size_t pos); - -/** - * @brief Same as pick_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param pos : The position (offset) of the requested tagged value - * @return : The tagged value stored at pos if any, NULL otherwise - * - * Note this is a read-only access and the tagged value will not be removed from the list - */ -tagged_value_t *list_pick_tagged_value(linked_list_t *list, size_t pos); - -/** - * @brief Same as fetch_value but when using the list to store tagged values - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param pos : The position (offset) of the requested tagged value - * @return : The tagged value stored at pos if any, NULL otherwise - * - * Note this is a read-write access and the tagged value will be removed from - * the list before returning it. - * The tagged value will not be released - */ -tagged_value_t *list_fetch_tagged_value(linked_list_t *list, size_t pos); - -/** - * @brief Get a tagged value from the list by using its tag instead of the position - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param tag : The tag of the value we are looking for - * @return The first tagged value in the list whose tag matches the provided tag - * - * Note this is a read-only access and the tagged value will not be removed from the list - */ -tagged_value_t *list_get_tagged_value(linked_list_t *list, char *tag); - -/** - * @brief Set a new tagged value in the list. 
If the list already - * contains values with the same tag, the first occurrence will be replaced with the new value - * (but still at the same index in the list) - * @param list: The list used as value - * @param tval: The new tagged value to insert to the list - * @return The previous tagged_value_t matching the given tag if any; NULL otherwise - * @note If a tagged value with the same tag is already contained in the list, - * this function will replace the old tagged_value_t structure with the - * new one preserving the position in the list.\n - * If no matching tagged_value_t structure is found, then the new one - * is added to the end of the list - */ -tagged_value_t *list_set_tagged_value(linked_list_t *list, char *tag, void *value, size_t len, int copy); - - -/** - * @brief Get all value pointers for all tagged values matching a specific tag - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param tag : The tag of the values we are looking for - * @param values : a valid pointer to a linked_list_t structure where to put the - * value pointers held by the tagged_value_t items matching the provided tag - * @return The number of tagged values matching the tag and added to the values linked list - * - * Note The caller MUST NOT release resources for the returned values - * (since still pointed by the tagged_value_t still in list) - */ -size_t list_get_tagged_values(linked_list_t *list, char *tag, linked_list_t *values); - -/** - * @brief Sort the content of the list using an in-place quicksort algorithm and a - * provided callback able to compare the value stored in the list - * @param list : A valid pointer to a linked_list_t structure holding tagged values - * @param comparator : A valid list_comparator_callback_t callback able to compare the - * actual value stored in the list - */ -void list_sort(linked_list_t *list, list_comparator_callback_t comparator); - - -/******************************************************************** - * Slice API - ********************************************************************/ - -typedef struct _slice_s slice_t; - -slice_t *slice_create(linked_list_t *list, size_t offset, size_t length); - -void slice_destroy(slice_t *slice); - -int slice_foreach_value(slice_t *slice, item_handler_t item_handler, void *user); - -#ifdef __cplusplus -} -#endif - -#endif - -// vim: tabstop=4 shiftwidth=4 expandtab: -/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ diff --git a/sparsity/src/main.c b/sparsity/src/main.c deleted file mode 100644 index e588424..0000000 --- a/sparsity/src/main.c +++ /dev/null @@ -1,54 +0,0 @@ -// -// main.c -// traildb_to_sparse -// -// Created by Alan Höng on 19/02/2017. -// Copyright © 2017 Alan Höng. All rights reserved. 
-// - -#include <stdio.h> -#include <traildb.h> -#include "traildb_coo.h" -#include <inttypes.h> - -int main(int argc, const char * argv[]) { - tdb_error err; - const char * db_path = argv[1]; - tdb* db = tdb_init(); - - printf("%s\n", db_path); - if ((err = tdb_open(db, db_path))){ - printf("Opening TrailDB failed: %s\n", tdb_error_str(err)); - exit(1); - } - - uint64_t num_events = tdb_num_events(db); - - tdb_close(db); - - uint64_t *row_idx_array = malloc(sizeof(uint64_t) * num_events); - uint64_t *col_idx_array = malloc(sizeof(uint64_t) * num_events); - uint8_t **uids = malloc(sizeof(uint8_t*) * num_events); - uint64_t *timestamps = malloc(sizeof(uint64_t) * num_events); - - traildb_coo_repr(db_path, "username", row_idx_array, col_idx_array, uids, timestamps); - - int i; - for (i=0; i < num_events; i++){ - char str[37] = {}; - - printf("row_idx: %" PRIu64 "\n", row_idx_array[i]); - printf("col_idx: %" PRIu64 "\n", col_idx_array[i]); - printf("ts: %" PRIu64 "\n", timestamps[i]); - } - - //free data - free(row_idx_array); - free(col_idx_array); - for(i=0; i < num_events; i++){ - free(uids[i]); - } - free(uids); - free(timestamps); - return 0; -} diff --git a/sparsity/src/traildb_coo.c b/sparsity/src/traildb_coo.c deleted file mode 100644 index 6dbbd5f..0000000 --- a/sparsity/src/traildb_coo.c +++ /dev/null @@ -1,115 +0,0 @@ -// -// traildb_coo.c -// traildb_to_sparse -// -// Created by Alan Höng on 19/02/2017. -// Copyright © 2017 Alan Höng. All rights reserved. -// -#include "hashtable.h" -#include "traildb_coo.h" -#include <inttypes.h> - -uint64_t traildb_coo_repr(const char* fname, const char* fieldname, - uint64_t* row_idx_array, uint64_t* col_idx_array, - uint8_t* uids, uint64_t* timestamps, - char** col_names, uint64_t** str_lens){ - int summed = 0; - tdb_error err; - const char * db_path = fname; - tdb* db = tdb_init(); - - - printf("%s\n", db_path); - if ((err = tdb_open(db, db_path))){ - printf("Opening TrailDB failed: %s\n", tdb_error_str(err)); - exit(1); - } - - tdb_field oh_field; - if (( err = tdb_get_field(db, fieldname, &oh_field))){ - printf("Could not find field: %s\n", tdb_error_str(err)); - exit(1); - } - - uint64_t n_columns = tdb_lexicon_size(db, oh_field); - hashtable_t *col_mapping = ht_create(n_columns, n_columns, free); - linked_list_t* cols = list_create(); - linked_list_t* col_len = list_create(); - - uint64_t max_col_idx = 0; - - tdb_cursor *cursor = tdb_cursor_new(db); - - uint64_t i; - uint64_t j; - uint64_t row_idx = 0; - uint64_t cidx; - uint64_t total_col_chars = 1; - /* loop over all trails aka users */ - for (i = 0; i < tdb_num_trails(db); i++){ - const tdb_event *event; - tdb_get_trail(cursor, i); - - /* loop over all events */ - while ((event = tdb_cursor_next(cursor))){ - for (j = 0; j < event->num_items; j++){ - if (oh_field == tdb_item_field(event->items[j])){ - uint64_t len; - const char *val = tdb_get_item_value(db, event->items[j], &len); - if (ht_exists(col_mapping, val, len)){ - cidx = *((uint64_t*) (ht_get(col_mapping, val, len, NULL))); - } else { - uint64_t *tmp = malloc(sizeof max_col_idx); - *tmp = max_col_idx; - ht_set(col_mapping, val, len, tmp, 1); - cidx = max_col_idx; - max_col_idx += 1; - - char* col_name = malloc(sizeof(char)*(len+1)); - *((char *)memcpy(col_name, val, len+1)) = '\0'; - total_col_chars += len; - list_push_value(cols, col_name); - - uint64_t *list_len = (uint64_t*) malloc(sizeof(uint64_t)); - *list_len = len; - list_push_value(col_len, list_len); - } - if (summed <=0){ - row_idx_array[row_idx] = row_idx; - 
col_idx_array[row_idx] = cidx; - timestamps[row_idx] = event->timestamp; - - //uids[row_idx] = malloc(sizeof(uint8_t)*16); - memcpy(&uids[row_idx*16], tdb_get_uuid(db, i), 16); - - row_idx += 1; - } - break; - } - } - } - } - uint64_t n_cols = (uint64_t) list_count(cols); - char* cols_concat = (char*) malloc(sizeof(char)*(total_col_chars)); - cols_concat[0] = '\0'; - for (i = 0; i < n_cols; i++){ - char* tmp = (char *) list_pop_value(cols); - strcat(cols_concat, tmp); - free(tmp); - } - *col_names = cols_concat; - uint64_t* col_lens = (uint64_t*) malloc(sizeof(uint64_t)*n_cols); - for (i = 0; i < n_cols; i++){ - uint64_t *val = (uint64_t*) list_pop_value(col_len); - col_lens[i] = *val; - free(val); - } - *str_lens = col_lens; - tdb_cursor_free(cursor); - list_destroy(cols); - list_destroy(col_len); - ht_destroy(col_mapping); - tdb_close(db); - //printf("col names: %s\n", *col_names); - return n_cols; -} diff --git a/sparsity/src/traildb_coo.h b/sparsity/src/traildb_coo.h deleted file mode 100644 index 63ca7be..0000000 --- a/sparsity/src/traildb_coo.h +++ /dev/null @@ -1,17 +0,0 @@ -// -// traildb_coo.h -// traildb_to_sparse -// -// Created by Alan Höng on 19/02/2017. -// Copyright © 2017 Alan Höng. All rights reserved. -// - -#ifndef traildb_coo_h -#define traildb_coo_h -#include <stdio.h> -#include <traildb.h> -uint64_t traildb_coo_repr(const char* fname, const char* fieldname, - uint64_t* row_idx_array, uint64_t* col_idx_array, - uint8_t* uids, uint64_t* timestamps, - char** col_names, uint64_t** str_lens); -#endif /* traildb_coo_h */ diff --git a/sparsity/test/__init__.py b/sparsity/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sparsity/test/conftest.py b/sparsity/test/conftest.py index 165ac2b..24970f0 100644 --- a/sparsity/test/conftest.py +++ b/sparsity/test/conftest.py @@ -1,12 +1,85 @@ import os -import pytest +import shutil +import tempfile +from contextlib import contextmanager import numpy as np import pandas as pd - +import pytest import sparsity +# 2017 starts with a sunday +from sparsity import SparseFrame + + +@pytest.fixture() +def sampledata(): + def gendata(n, categorical=False): + sample_data = pd.DataFrame( + dict(date=pd.date_range("2017-01-01", periods=n))) + sample_data["weekday"] = sample_data.date.dt.weekday_name + sample_data["weekday_abbr"] = sample_data.weekday.apply( + lambda x: x[:3]) + + if categorical: + sample_data['weekday'] = sample_data['weekday'].astype('category') + sample_data['weekday_abbr'] = sample_data['weekday_abbr'] \ + .astype('category') + + sample_data["id"] = np.tile(np.arange(7), len(sample_data) // 7 + 1)[ + :len(sample_data)] + return sample_data + + return gendata + + +@pytest.fixture() +def sample_frame_labels(): + return SparseFrame(np.identity(5), + columns=list('ABCDE'), + index=list('VWXYZ')) + +@pytest.fixture() +def weekdays(): + return ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday'] + + +@pytest.fixture() +def weekdays_abbr(weekdays): + return list(map(lambda x: x[:3], weekdays)) + + +@pytest.fixture() +def groupby_frame(): + shuffle_idx = np.random.permutation(np.arange(100)) + index = np.tile(np.arange(10), 10) + data = np.vstack([np.identity(10) for _ in range(10)]) + t = SparseFrame(data[shuffle_idx, :], index=index[shuffle_idx]) + return t + + +@pytest.fixture() +def sf_midx(): + midx = pd.MultiIndex.from_arrays( + [pd.date_range("2016-10-01", periods=5), + np.arange(5)] + ) + cols = list('ABCDE') + sf = SparseFrame(np.identity(5), index=midx, columns=cols) + 
return sf + +@pytest.fixture() +def sf_midx_int(): + midx = pd.MultiIndex.from_arrays( + [np.concatenate([np.ones(4), np.zeros(1)]), + pd.date_range("2016-10-01", periods=5)] + ) + cols = list('ABCDE') + sf = SparseFrame(np.identity(5), index=midx, columns=cols) + return sf + @pytest.fixture() def testdb(): return os.path.join(sparsity.__path__[0], 'test/tiny.tdb') @@ -16,7 +89,47 @@ def testdb(): def clickstream(): df = pd.DataFrame(dict( page_id=np.random.choice(list('ABCDE'), size=100), + other_categorical=np.random.choice(list('FGHIJ'), size=100), id=np.random.choice([1,2,3,4,5,6,7,8,9], size=100) ), index=pd.date_range("2016-01-01", periods=100)) - return df \ No newline at end of file + return df + + +@pytest.fixture() +def complex_example(): + first = np.identity(10) + second = np.zeros((4, 10)) + third = np.zeros((4, 10)) + second[[0, 1, 2, 3], [2, 3, 4, 5]] = 10 + third[[0, 1, 2, 3], [6, 7, 8, 9]] = 20 + + shuffle_idx = np.arange(10) + np.random.shuffle(shuffle_idx) + + first = SparseFrame(first[shuffle_idx], + index=np.arange(10)[shuffle_idx]) + + shuffle_idx = np.arange(4) + np.random.shuffle(shuffle_idx) + + second = SparseFrame(second[shuffle_idx], + index=np.arange(2, 6)[shuffle_idx]) + + shuffle_idx = np.arange(4) + np.random.shuffle(shuffle_idx) + + third = SparseFrame(third[shuffle_idx], + index=np.arange(6, 10)[shuffle_idx]) + return first, second, third + + +@contextmanager +def tmpdir(dir=None): + dirname = tempfile.mkdtemp(dir=dir) + + try: + yield dirname + finally: + if os.path.exists(dirname): + shutil.rmtree(dirname, ignore_errors=True) diff --git a/sparsity/test/pydata.tdb b/sparsity/test/pydata.tdb deleted file mode 100644 index 4dd1b83..0000000 Binary files a/sparsity/test/pydata.tdb and /dev/null differ diff --git a/sparsity/test/test_coo.py b/sparsity/test/test_coo.py deleted file mode 100644 index 160d51a..0000000 --- a/sparsity/test/test_coo.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np -import pytest -try: - from sparsity._traildb import traildb_coo_repr_func - from sparsity.io import traildb_to_coo - trail_db = True -except (ImportError, OSError): - trail_db = False - -@pytest.mark.skipif(trail_db is False, reason="TrailDB is not installed") -def test_coo_func(testdb): - r_idx = np.zeros(9, dtype=np.uint64) - c_idx = np.zeros(9, dtype=np.uint64) - uuids = np.zeros((9,16), dtype=np.uint8) - timestamps = np.zeros(9, dtype=np.uint64) - res = traildb_coo_repr_func(testdb.encode(), b"username", r_idx, c_idx, - uuids, - timestamps) - assert all(r_idx == np.arange(9)) - assert all(c_idx[:3] == 0) - assert all(c_idx[3:6] == 1) - assert all(c_idx[6:] == 2) - -# def test_db_to_coo(testdb): -# res = traildb_to_coo(testdb, "action") -# pass diff --git a/sparsity/test/test_dask_sparse_frame.py b/sparsity/test/test_dask_sparse_frame.py index 4487307..dabfde1 100644 --- a/sparsity/test/test_dask_sparse_frame.py +++ b/sparsity/test/test_dask_sparse_frame.py @@ -1,32 +1,28 @@ -import shutil -import tempfile import os -from contextlib import contextmanager import dask +import dask.dataframe as dd +import datetime as dt +import numpy as np +import pandas as pd +import pandas.util.testing as pdt import pytest +from distributed import Client +from uuid import uuid4 import sparsity as sp import sparsity.dask as dsp -import pandas as pd -import numpy as np -import dask.dataframe as dd - from sparsity.dask.reshape import one_hot_encode +from .conftest import tmpdir -dask.context.set_options(get=dask.async.get_sync) - - -@contextmanager -def tmpdir(dir=None): - dirname = 
tempfile.mkdtemp(dir=dir) +dask.config.set(scheduler=dask.local.get_sync) - try: - yield dirname - finally: - if os.path.exists(dirname): - shutil.rmtree(dirname, ignore_errors=True) +@pytest.fixture +def dsf(): + return dsp.from_pandas(pd.DataFrame(np.random.rand(10,2), + columns=['A', 'B']), + npartitions=3) def test_from_pandas(): @@ -50,6 +46,74 @@ def test_map_partitions(): assert res.shape == (10, 2) +def test_todense(): + data = pd.DataFrame(np.random.rand(10, 2)) + dsf = dsp.from_pandas(data, npartitions=3) + res = dsf.todense() + assert isinstance(res, dd.DataFrame) + computed = res.compute() + pdt.assert_frame_equal(computed, data, check_dtype=False) + + +def test_todense_series(): + data = pd.DataFrame(np.random.rand(10, 2)) + dsf = dsp.from_pandas(data, npartitions=3)[0] + res = dsf.todense() + assert isinstance(res, dd.Series) + computed = res.compute() + pdt.assert_series_equal(computed, data[0], check_dtype=False) + + +# noinspection PyStatementEffect +@pytest.mark.parametrize('item, raises', [ + ('X', False), + (['X', 'Y'], False), + ('A', True), + (['A'], True), + (['X', 'A'], True), + (['A', 'B'], True), +]) +def test_getitem(item, raises): + df = pd.DataFrame(np.random.rand(10, 3), columns=list('XYZ'), + index=list('ABCDEFGHIJ')) + dsf = dsp.from_pandas(df, npartitions=2) + + correct_cols = item if isinstance(item, list) else [item] + + if raises: + with pytest.raises(KeyError): + dsf[item] + return + + res = dsf[item] + assert res.columns.tolist() == correct_cols + res_computed = res.compute() + assert res_computed.columns.tolist() == correct_cols + if not isinstance(item, list): + pdt.assert_series_equal(df[item], res_computed.todense()) + else: + pdt.assert_frame_equal(df[item], res_computed.todense()) + + +@pytest.mark.parametrize('item', [ + 'X', + ['X', 'Y'], +]) +def test_getitem_empty(item): + df = pd.DataFrame([], columns=list('XYZ'), dtype=int) + dsf = dsp.from_ddf(dd.from_pandas(df, npartitions=1)) + + correct_cols = item if isinstance(item, list) else [item] + res = dsf[item] + assert res.columns.tolist() == correct_cols + res_computed = res.compute() + assert res_computed.columns.tolist() == correct_cols + if not isinstance(item, list): + pdt.assert_series_equal(df[item], res_computed.todense()) + else: + pdt.assert_frame_equal(df[item], res_computed.todense()) + + @pytest.mark.parametrize('iindexer, correct_shape', [ (slice('A', 'B'), (2, 2)), (slice('C', None), (8, 2)), @@ -58,13 +122,40 @@ def test_map_partitions(): def test_loc(iindexer, correct_shape): df = pd.DataFrame(np.random.rand(10, 2), index=list('ABCDEFGHIJ')) + ddf = dd.from_pandas(df, npartitions=2) + ddf.loc[iindexer] + dsf = dsp.from_pandas(df, npartitions=2) - res = dsf.loc[iindexer].compute() + fut = dsf.loc[iindexer] + assert fut._meta.empty + res = fut.compute() assert isinstance(res, sp.SparseFrame) assert res.shape == correct_shape +def test_dask_loc(clickstream): + sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id']) + res = sf.loc['2016-01-15':'2016-02-15'] + res = res.compute() + assert res.index.levels[0].max().date() == dt.date(2016, 2, 15) + assert res.index.levels[0].min().date() == dt.date(2016, 1, 15) + + +def test_dask_multi_index_loc(clickstream): + sf = one_hot_encode(dd.from_pandas(clickstream, npartitions=10), + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id']) + res = 
sf.loc['2016-01-15':'2016-02-15'] + res = res.compute() + assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) + assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) + + def test_repr(): dsf = dsp.from_pandas(pd.DataFrame(np.random.rand(10, 2)), npartitions=3) @@ -75,16 +166,106 @@ def test_repr(): assert isinstance(dsf.__repr__(), str) -def test_one_hot(clickstream): +def test_one_hot_legacy(clickstream): ddf = dd.from_pandas(clickstream, npartitions=10) - dsf = one_hot_encode(ddf, column='page_id', - categories=list('ABCDE'), - index_col=['index', 'id']) + dsf = one_hot_encode(ddf, 'page_id', list('ABCDE'), ['index', 'id']) + assert dsf._meta.empty sf = dsf.compute() assert sf.shape == (100, 5) assert isinstance(sf.index, pd.MultiIndex) +def test_one_hot_no_order(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id']) + assert dsf._meta.empty + assert sorted(dsf.columns) == list('ABCDEFGHIJ') + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert sorted(sf.columns) == list('ABCDEFGHIJ') + + +def test_one_hot_no_order_categorical(clickstream): + clickstream['other_categorical'] = clickstream['other_categorical'] \ + .astype('category') + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id']) + assert dsf._meta.empty + assert sorted(dsf.columns) == list('ABCDEFGHIJ') + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert sorted(sf.columns) == list('ABCDEFGHIJ') + + +def test_one_hot_prefixes(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + index_col=['index', 'id'], + prefixes=True) + correct_columns = list(map(lambda x: 'page_id_' + x, list('ABCDE'))) \ + + list(map(lambda x: 'other_categorical_' + x, list('FGHIJ'))) + assert dsf._meta.empty + assert sorted(dsf.columns) == sorted(correct_columns) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert sorted(sf.columns) == sorted(correct_columns) + + +def test_one_hot_order1(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + order=['page_id', 'other_categorical'], + index_col=['index', 'id']) + assert dsf._meta.empty + assert all(dsf.columns == list('ABCDEFGHIJ')) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert all(sf.columns == list('ABCDEFGHIJ')) + + +def test_one_hot_order2(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + order=['other_categorical', 'page_id'], + index_col=['index', 'id']) + assert dsf._meta.empty + assert all(dsf.columns == list('FGHIJABCDE')) + sf = dsf.compute() + assert sf.shape == (100, 10) + assert isinstance(sf.index, pd.MultiIndex) + assert all(sf.columns == list('FGHIJABCDE')) + + +def test_one_hot_disk_categories(clickstream): + with tmpdir() as tmp: + cat_path = os.path.join(tmp, 'cat.pickle') + 
pd.Series(list('ABCDE')).to_pickle(cat_path) + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': cat_path}, + index_col=['index', 'id']) + assert dsf._meta.empty + sf = dsf.compute() + assert sf.shape == (100, 5) + assert isinstance(sf.index, pd.MultiIndex) + + def test_read_npz(): sf = sp.SparseFrame(np.identity(100)) with tmpdir() as tmp: @@ -93,6 +274,271 @@ def test_read_npz(): sf.iloc[50:75].to_npz(os.path.join(tmp, '3')) sf.iloc[75:].to_npz(os.path.join(tmp, '4')) - dsf = dsp.read_npz(os.path.join(tmp, '*.npz')) + dsf = dsp.read_npz(os.path.join(tmp, '*.npz'), read_divisions=True) sf = dsf.compute() - assert np.all(sf.data.toarray() == np.identity(100)) \ No newline at end of file + assert dsf.known_divisions + assert np.all(sf.data.toarray() == np.identity(100)) + + +def test_to_npz(dsf): + dense = dsf.compute().todense() + with tmpdir() as tmp: + path = os.path.join(tmp, '*.npz') + dsf.to_npz(path) + loaded = dsp.read_npz(path) + assert loaded.known_divisions + res = loaded.compute().todense() + pdt.assert_frame_equal(dense, res) + + +def test_assign_column(): + s = pd.Series(np.arange(10)) + ds = dd.from_pandas(s, npartitions=2) + + f = pd.DataFrame(np.random.rand(10, 2), columns=['a', 'b']) + dsf = dsp.from_pandas(f, npartitions=2) + + dsf = dsf.assign(new=ds) + assert dsf._meta.empty + sf = dsf.compute() + assert np.all((sf.todense() == f.assign(new=s)).values) + + +@pytest.mark.parametrize('arg_dict', [ + dict(divisions=[0, 30, 50, 70, 99]), + dict(npartitions=6), + dict(npartitions=2), +]) +def test_repartition_divisions(arg_dict): + df = pd.DataFrame(np.identity(100)) + dsf = dsp.from_pandas(df, npartitions=4) + + dsf2 = dsf.repartition(**arg_dict) + + assert isinstance(dsf2, dsp.SparseFrame) + if 'divisions' in arg_dict: + assert tuple(dsf2.divisions) == tuple(arg_dict['divisions']) + + df2 = dsf2.compute().todense() + pdt.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('start_part, end_part', [ + (2, 4), + (3, 2), + (3, 3), +]) +def test_repartition_n_divisions(start_part, end_part): + df = pd.DataFrame(np.identity(10)) + dsf = dsp.from_pandas(df, npartitions=start_part) + + dsf2 = dsf.repartition(npartitions=end_part) + + assert isinstance(dsf2, dsp.SparseFrame) + assert dsf2.npartitions == end_part + + df2 = dsf2.compute().todense() + pdt.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('how', ['left', 'right', 'inner', 'outer']) +def test_distributed_join(how): + left = pd.DataFrame(np.identity(10), + index=np.arange(10), + columns=list('ABCDEFGHIJ')) + right = pd.DataFrame(np.identity(10), + index=np.arange(5, 15), + columns=list('KLMNOPQRST')) + correct = left.join(right, how=how).fillna(0) + + d_left = dsp.from_pandas(left, npartitions=2) + d_right = dsp.from_pandas(right, npartitions=2) + + joined = d_left.join(d_right, how=how) + + res = joined.compute().todense() + + pdt.assert_frame_equal(correct, res) + + +def test_add(): + df = pd.DataFrame(np.identity(12)) + df2 = df.copy() + df2.index += 1 + + sf1 = sp.SparseFrame(df) + sf2 = sp.SparseFrame(df2) + correct = sf1.add(sf2).todense() + + dsf = dsp.from_pandas(df, npartitions=4) + dsf2 = dsp.from_pandas(df2, npartitions=4) + + res = dsf.add(dsf2).compute().todense() + pdt.assert_frame_equal(res, correct) + + +@pytest.mark.parametrize('idx', [ + np.random.choice([uuid4() for i in range(1000)], size=10000), + np.random.randint(0, 10000, 10000), + np.random.randint(0, 10000, 10000).astype(float), + pd.date_range('01-01-1970', periods=10000, 
freq='s'), +]) +def test_groupby_sum(idx): + for sorted in [True, False]: + df = pd.DataFrame(dict(A=np.ones(len(idx)), B=np.arange(len(idx))), + index=idx, dtype=np.float) + correct = df.groupby(level=0).sum() + correct.sort_index(inplace=True) + + spf = dsp.from_ddf(dd.from_pandas(df, npartitions=10, sort=sorted)) + assert spf.npartitions == 10 + grouped = spf.groupby_sum(split_out=4) + grouped2 = spf.groupby_sum(split_out=12) + + assert grouped.npartitions == 4 + res1 = grouped.compute().todense() + res1.sort_index(inplace=True) + + assert grouped2.npartitions == 12 + res2 = grouped2.compute().todense() + res2.sort_index(inplace=True) + + pdt.assert_frame_equal(res1, correct) + pdt.assert_frame_equal(res2, correct) + + +@pytest.mark.parametrize('how', ['left', 'inner']) +def test_distributed_join_shortcut(how): + left = pd.DataFrame(np.identity(10), + index=np.arange(10), + columns=list('ABCDEFGHIJ')) + right = pd.DataFrame(np.identity(10), + index=np.arange(5, 15), + columns=list('KLMNOPQRST')) + correct = left.join(right, how=how).fillna(0) + + d_left = dsp.from_pandas(left, npartitions=2) + d_right = sp.SparseFrame(right) + + joined = d_left.join(d_right, how=how) + + res = joined.compute().todense() + + pdt.assert_frame_equal(correct, res) + + +@pytest.mark.parametrize('idx, sorted', [ + (list('ABCD'*25), True), + (np.array(list('0123'*25)).astype(int), True), + (np.array(list('0123'*25)).astype(float), True), + (list('ABCD'*25), False), + (np.array(list('0123'*25)).astype(int), False), + (np.array(list('0123'*25)).astype(float), False), +]) +def test_groupby_sum(idx, sorted): + + df = pd.DataFrame(dict(A=np.ones(100), B=np.ones(100)), + index=idx) + correct = df.groupby(level=0).sum() + correct.sort_index(inplace=True) + + spf = dsp.from_pandas(df, npartitions=2) + if not sorted: + spf.divisions = [None] * (spf.npartitions + 1) + assert spf.npartitions == 2 + grouped = spf.groupby_sum(split_out=3) + + assert grouped.npartitions == 3 + res = grouped.compute().todense() + res.sort_index(inplace=True) + + pdt.assert_frame_equal(res, correct) + + +def test_from_ddf(): + ddf = dd.from_pandas( + pd.DataFrame(np.random.rand(20, 4), + columns=list('ABCD')), + npartitions=4 + ) + correct = ddf.compute() + + dsf = dsp.from_ddf(ddf) + + res = dsf.compute().todense() + + pdt.assert_frame_equal(correct, res) + + with pytest.raises(ValueError): + ddf = ddf.assign(A="some str value") + dsf = dsp.from_ddf(ddf) + + +def test_sdf_sort_index(): + data = pd.DataFrame(np.random.rand(20, 4), + columns=list('ABCD'), + index=np.random.choice([1,2,3,4,5,6], 20)) + ddf = dd.from_pandas(data, + npartitions=4, + sort=False, + ) + + dsf = dsp.from_ddf(ddf) + dsf = dsf.sort_index() + + assert dsf.known_divisions + + res = dsf.compute() + assert res.index.is_monotonic + assert res.columns.tolist() == list('ABCD') + + +def test_sdf_sort_index_auto_partition(): + data = pd.DataFrame(np.random.rand(20000, 4), + columns=list('ABCD'), + index=np.random.choice(list(range(5000)), 20000)) + ddf = dd.from_pandas(data, + npartitions=20, + sort=False, + ) + + dsf = dsp.from_ddf(ddf) + dsf = dsf.sort_index(npartitions='auto', partition_size=80000) + + assert dsf.known_divisions + assert dsf.npartitions == 16 + + res = dsf.compute() + assert res.index.is_monotonic + assert res.columns.tolist() == list('ABCD') + + +def test_get_partition(dsf): + correct = dsf.compute().todense() + parts = [dsf.get_partition(i).compute().todense() + for i in range(dsf.npartitions)] + res = pd.concat(parts, axis=0) + pdt.assert_frame_equal(res, 
correct) + + +def test_set_index(clickstream): + ddf = dd.from_pandas(clickstream, npartitions=10) + dsf = one_hot_encode(ddf, + categories={'page_id': list('ABCDE'), + 'other_categorical': list('FGHIJ')}, + order=['other_categorical', 'page_id'], + index_col=['index', 'id']) + dense = dsf.compute().set_index(level=1).todense() + res = dsf.set_index(level=1).compute().todense() + + pdt.assert_frame_equal(dense, res) + + +def test_persist(dsf): + correct = dsf.compute().todense() + client = Client() + persisted = client.persist(dsf) + + res = persisted.compute().todense() + + pdt.assert_frame_equal(res, correct) diff --git a/sparsity/test/test_sparse_frame.py b/sparsity/test/test_sparse_frame.py index 8e196cc..3cda67c 100644 --- a/sparsity/test/test_sparse_frame.py +++ b/sparsity/test/test_sparse_frame.py @@ -1,57 +1,71 @@ # coding=utf-8 -import os import datetime as dt -import pandas as pd +import os + +from contextlib import contextmanager -import dask.dataframe as dd import numpy as np +import pandas as pd +import pandas.testing as pdt import pytest -from dask.async import get_sync +from moto import mock_s3 from scipy import sparse - from sparsity import SparseFrame, sparse_one_hot - -try: - import traildb -except (ImportError, OSError): - traildb = False - - -# 2017 starts with a sunday -@pytest.fixture() -def sampledata(): - def gendata(n): - sample_data = pd.DataFrame( - dict(date=pd.date_range("2017-01-01", periods=n))) - sample_data["weekday"] = sample_data.date.dt.weekday_name - sample_data["id"] = np.tile(np.arange(7), len(sample_data) // 7 + 1)[ - :len(sample_data)] - return sample_data - - return gendata - - -@pytest.fixture() -def sf_midx(): - midx = pd.MultiIndex.from_arrays( - [pd.date_range("2016-10-01", periods=5), - np.arange(5)] - ) - cols = list('ABCDE') - sf = SparseFrame(np.identity(5), index=midx, columns=cols) - return sf +from sparsity.io_ import _csr_to_dict + +from .conftest import tmpdir + + +@contextmanager +def mock_s3_fs(bucket, data=None): + """Mocks an s3 bucket + + Parameters + ---------- + bucket: str + bucket name + data: dict + dictionary with paths relative to bucket and + bytestrings as values. Will mock data in bucket + if supplied. 
+ + Returns + ------- + """ + try: + m = mock_s3() + m.start() + import boto3 + import s3fs + client = boto3.client('s3', region_name='us-east-1') + client.create_bucket(Bucket=bucket) + if data is not None: + data = data.copy() + for key, value in data.items(): + client.put_object(Bucket=bucket, Key=key, Body=value) + yield + finally: + if data is not None: + for key in data.keys(): + client.delete_object(Bucket=bucket, Key=key) + m.stop() def test_empty_init(): sf = SparseFrame(np.array([]), index=[], columns=['A', 'B']) assert sf.data.shape == (0, 2) + sf = SparseFrame(np.array([]), index=['A', 'B'], columns=[]) + assert sf.data.shape == (2, 0) -def test_groupby(): - shuffle_idx = np.random.permutation(np.arange(100)) - index = np.tile(np.arange(10), 10) - data = np.vstack([np.identity(10) for _ in range(10)]) - t = SparseFrame(data[shuffle_idx, :], index=index[shuffle_idx]) + +def test_empty_column_access(): + sf = SparseFrame(np.array([]), index=[], columns=['A', 'B', 'C', 'D']) + assert sf['D'].data.shape == (0, 1) + + +def test_groupby(groupby_frame): + t = groupby_frame res = t.groupby_sum().data.todense() assert np.all(res == (np.identity(10) * 10)) @@ -115,10 +129,9 @@ def test_mutually_exclusive_join(): left_ax0 = SparseFrame(np.identity(5), columns=np.arange(5)) right_ax0 = SparseFrame(np.identity(5), columns=np.arange(5, 10)) - with pytest.raises(NotImplementedError): # FIXME: remove when repaired - res_ax0 = left_ax0.join(right_ax0, axis=0) - assert np.all(res_ax0.data.todense() == correct), \ - "Joining along axis 0 failed." + res_ax0 = left_ax0.join(right_ax0, axis=0) + assert np.all(res_ax0.data.todense() == correct), \ + "Joining along axis 0 failed." assert np.all(res_ax1.data.todense() == correct), \ "Joining along axis 1 failed." 
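The hunk above drops the `pytest.raises(NotImplementedError)` guard, so joining two SparseFrames with mutually exclusive columns along axis 0 is now asserted directly. A minimal, hedged sketch of the behavior the updated test exercises (not taken from the test file; the block-diagonal `expected` matrix below is an assumption implied by the test's `correct` fixture):

```python
import numpy as np
from sparsity import SparseFrame

# Two frames with disjoint column ranges, mirroring the test above.
left = SparseFrame(np.identity(5), columns=np.arange(5))
right = SparseFrame(np.identity(5), columns=np.arange(5, 10))

# Joining along axis 0 stacks the rows and aligns the union of columns,
# filling missing entries with zeros -- i.e. a 10x10 block-diagonal result
# (assumed layout: left's rows first, then right's rows).
expected = np.vstack([
    np.hstack([np.identity(5), np.zeros((5, 5))]),
    np.hstack([np.zeros((5, 5)), np.identity(5)]),
])

joined = left.join(right, axis=0)
assert joined.data.todense().shape == (10, 10)
assert np.all(joined.data.todense() == expected)
```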
@@ -155,6 +168,10 @@ def test_loc(): # test slices assert np.all(sf.loc[:'B'].data.todense() == np.identity(5)[:2]) + # test all + assert np.all(sf.loc[list("ABCDE")].data.todense() == np.identity(5)) + assert np.all(sf.loc[:, :].data.todense() == np.identity(5)) + assert np.all(sf.loc[:].data.todense() == np.identity(5)) sf = SparseFrame(np.identity(5), pd.date_range("2016-10-01", periods=5)) @@ -171,7 +188,7 @@ def test_loc(): np.identity(5)[:3]) -def test_loc_multi_index(sf_midx): +def test_loc_multi_index(sf_midx, sf_midx_int): assert sf_midx.loc['2016-10-01'].data[0, 0] == 1 @@ -187,6 +204,9 @@ def test_loc_multi_index(sf_midx): assert np.all(sf_midx.loc[dt_slice].data.todense() == np.identity(5)[:3]) + assert np.all(sf_midx_int.loc[1].todense().values == sf_midx.data[:4,:]) + assert np.all(sf_midx_int.loc[0].todense().values == sf_midx.data[4, :]) + def test_set_index(sf_midx): sf = sf_midx.set_index(level=1) @@ -211,6 +231,26 @@ def test_set_index(sf_midx): # assert np.all(sf.loc[[4, 5]].data.todense() == np.identity(5)[[3, 4]]) +def test_save_load_multiindex(sf_midx): + with tmpdir() as tmp: + # test new + path = os.path.join(tmp, 'sf.npz') + sf_midx.to_npz(path) + res = SparseFrame.read_npz(path) + assert isinstance(res.index, pd.MultiIndex) + + # test backwards compatibility + def _to_npz_legacy(sf, filename): + data = _csr_to_dict(sf.data) + data['frame_index'] = sf.index.values + data['frame_columns'] = sf.columns.values + np.savez(filename, **data) + + _to_npz_legacy(sf_midx, path) + res = SparseFrame.read_npz(path) + assert isinstance(res.index, pd.MultiIndex) + + def test_new_column_assign_array(): sf = SparseFrame(np.identity(5)) sf[6] = np.ones(5) @@ -262,34 +302,6 @@ def test_existing_column_assign_number(): assert np.all(correct == sf.data.todense()) -@pytest.fixture() -def complex_example(): - first = np.identity(10) - second = np.zeros((4, 10)) - third = np.zeros((4, 10)) - second[[0, 1, 2, 3], [2, 3, 4, 5]] = 10 - third[[0, 1, 2, 3], [6, 7, 8, 9]] = 20 - - shuffle_idx = np.arange(10) - np.random.shuffle(shuffle_idx) - - first = SparseFrame(first[shuffle_idx], - index=np.arange(10)[shuffle_idx]) - - shuffle_idx = np.arange(4) - np.random.shuffle(shuffle_idx) - - second = SparseFrame(second[shuffle_idx], - index=np.arange(2, 6)[shuffle_idx]) - - shuffle_idx = np.arange(4) - np.random.shuffle(shuffle_idx) - - third = SparseFrame(third[shuffle_idx], - index=np.arange(6, 10)[shuffle_idx]) - return first, second, third - - def test_add_total_overlap(complex_example): first, second, third = complex_example correct = first.sort_index().data.todense() @@ -342,7 +354,19 @@ def test_add_no_overlap(complex_example): assert np.all(res.data.todense() == correct) -def test_csr_one_hot_series(sampledata): +def test_csr_one_hot_series_disk_categories(sampledata): + with tmpdir() as tmp: + categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', + 'Thursday', 'Friday', 'Saturday'] + cat_path = os.path.join(tmp, 'bla.pickle') + pd.Series(categories).to_pickle(cat_path) + sparse_frame = sparse_one_hot(sampledata(49), + categories={'weekday': cat_path}) + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == np.identity(7) * 7) + + +def test_csr_one_hot_series_legacy(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'] sparse_frame = sparse_one_hot(sampledata(49), 'weekday', categories) @@ -350,10 +374,175 @@ def test_csr_one_hot_series(sampledata): assert np.all(res == np.identity(7) * 7) +def 
test_csr_one_hot_series(sampledata, weekdays, weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + order=['weekday', 'weekday_abbr']) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == correct) + assert all(sparse_frame.columns == (weekdays + weekdays_abbr)) + + +def test_csr_one_hot_series_categorical_same_order(sampledata, weekdays, + weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + data = sampledata(49, categorical=True) + + categories = {'weekday': data['weekday'].cat.categories.tolist(), + 'weekday_abbr': data['weekday_abbr'].cat.categories.tolist()} + + sparse_frame = sparse_one_hot(data, + categories=categories, + order=['weekday', 'weekday_abbr'], + ignore_cat_order_mismatch=False) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)) \ + .todense()[weekdays + weekdays_abbr].values + assert np.all(res == correct) + assert set(sparse_frame.columns) == set(weekdays + weekdays_abbr) + + +def test_csr_one_hot_series_categorical_different_order(sampledata, weekdays, + weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + data = sampledata(49, categorical=True) + + categories = { + 'weekday': data['weekday'].cat.categories.tolist()[::-1], + 'weekday_abbr': data['weekday_abbr'].cat.categories.tolist()[::-1] + } + + with pytest.raises(ValueError): + sparse_frame = sparse_one_hot(data, + categories=categories, + order=['weekday', 'weekday_abbr'], + ignore_cat_order_mismatch=False) + + +def test_csr_one_hot_series_categorical_different_order_ignore( + sampledata, weekdays, weekdays_abbr): + + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + data = sampledata(49, categorical=True) + + categories = { + 'weekday': data['weekday'].cat.categories.tolist()[::-1], + 'weekday_abbr': data['weekday_abbr'].cat.categories.tolist()[::-1] + } + + sparse_frame = sparse_one_hot(data, + categories=categories, + order=['weekday', 'weekday_abbr'], + ignore_cat_order_mismatch=True) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)) \ + .todense()[weekdays + weekdays_abbr].values + assert np.all(res == correct) + assert set(sparse_frame.columns) == set(weekdays + weekdays_abbr) + + +def test_csr_one_hot_series_categorical_no_categories( + sampledata, weekdays, weekdays_abbr): + + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + data = sampledata(49, categorical=True) + + categories = { + 'weekday': None, + 'weekday_abbr': None + } + + sparse_frame = sparse_one_hot(data, + categories=categories, + order=['weekday', 'weekday_abbr'], + ignore_cat_order_mismatch=True) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)) \ + .todense()[weekdays + weekdays_abbr].values + assert np.all(res == correct) + assert set(sparse_frame.columns) == set(weekdays + weekdays_abbr) + + +def test_csr_one_hot_series_other_order(sampledata, weekdays, weekdays_abbr): + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + order=['weekday_abbr', 'weekday']) + + assert all(sparse_frame.columns == (weekdays_abbr + weekdays)) + + +def test_csr_one_hot_series_no_order(sampledata, weekdays, weekdays_abbr): + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = 
sparse_one_hot(sampledata(49), categories=categories) + + assert sorted(sparse_frame.columns) == sorted(weekdays_abbr + weekdays) + + +def test_csr_one_hot_series_prefixes(sampledata, weekdays, weekdays_abbr): + correct = np.hstack((np.identity(7) * 7, + np.identity(7) * 7)) + + categories = {'weekday': weekdays, + 'weekday_abbr': weekdays_abbr} + + sparse_frame = sparse_one_hot(sampledata(49), categories=categories, + order=['weekday', 'weekday_abbr'], + prefixes=True) + + res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() + assert np.all(res == correct) + correct_columns = list(map(lambda x: 'weekday_' + x, weekdays)) \ + + list(map(lambda x: 'weekday_abbr_' + x, weekdays_abbr)) + assert all(sparse_frame.columns == correct_columns) + + +def test_csr_one_hot_series_same_categories(weekdays): + sample_data = pd.DataFrame( + dict(date=pd.date_range("2017-01-01", periods=7))) + sample_data["weekday"] = sample_data.date.dt.weekday_name + sample_data["weekday2"] = sample_data.date.dt.weekday_name + + categories = {'weekday': weekdays, + 'weekday2': weekdays} + + with pytest.raises(ValueError): + sparse_one_hot(sample_data, categories=categories, + order=['weekday', 'weekday2']) + + sparse_frame = sparse_one_hot(sample_data, categories=categories, + order=['weekday', 'weekday2'], + prefixes=True) + + correct_columns = list(map(lambda x: 'weekday_' + x, weekdays)) \ + + list(map(lambda x: 'weekday2_' + x, weekdays)) + assert all(sparse_frame.columns == correct_columns) + + def test_csr_one_hot_series_too_much_categories(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Yesterday', 'Saturday', 'Birthday'] - sparse_frame = sparse_one_hot(sampledata(49), 'weekday', categories) + sparse_frame = sparse_one_hot(sampledata(49), + categories={'weekday': categories}) res = sparse_frame.groupby_sum(np.tile(np.arange(7), 7)).data.todense() correct = np.identity(7) * 7 @@ -367,20 +556,7 @@ def test_csr_one_hot_series_too_little_categories(sampledata): categories = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] with pytest.raises(ValueError): - sparse_one_hot(sampledata(49), 'weekday', categories) - - -@pytest.mark.skipif(traildb is False, reason="TrailDB not installed") -def test_read_traildb(testdb): - res = SparseFrame.read_traildb(testdb, 'action') - assert res.shape == (9, 3) - - -@pytest.mark.skipif(traildb is False, reason="TrailDB not installed") -def test_add_traildb(testdb): - simple = SparseFrame.read_traildb(testdb, 'action') - doubled = simple.add(simple) - assert np.all(doubled.data.todense() == simple.data.todense() * 2) + sparse_one_hot(sampledata(49), categories={'weekday': categories}) def test_npz_io(complex_example): @@ -393,13 +569,79 @@ def test_npz_io(complex_example): os.remove('/tmp/sparse.npz') +def test_npz_io_s3(complex_example): + with mock_s3_fs('sparsity'): + sf, second, third = complex_example + sf.to_npz('s3://sparsity/sparse.npz') + loaded = SparseFrame.read_npz('s3://sparsity/sparse.npz') + assert np.all(loaded.data.todense() == sf.data.todense()) + assert np.all(loaded.index == sf.index) + assert np.all(loaded.columns == sf.columns) + + +# noinspection PyStatementEffect def test_getitem(): - sf = SparseFrame(np.identity(10), columns=list('abcdefghij')) + id_ = np.identity(10) + sf = SparseFrame(id_, columns=list('abcdefghij')) + assert sf['a'].data.todense()[0] == 1 assert sf['j'].data.todense()[9] == 1 + assert np.all(sf[['a', 'b']].data.todense() == np.asmatrix(id_[:, [0, 1]])) tmp = 
sf[['j', 'a']].data.todense() assert tmp[9, 0] == 1 assert tmp[0, 1] == 1 + assert (sf[list('abcdefghij')].data.todense() == np.identity(10)).all() + assert sf[[]].shape == (10, 0) + assert len(sf[[]].columns) == 0 + assert isinstance(sf.columns, type(sf[[]].columns)) + with pytest.raises(ValueError): + sf[None] + + idx = pd.Index(list('abc')) + pdt.assert_index_equal(idx, sf[idx].columns) + pdt.assert_index_equal(idx, sf[idx.to_series()].columns) + pdt.assert_index_equal(idx, sf[idx.tolist()].columns) + pdt.assert_index_equal(idx, sf[tuple(idx)].columns) + pdt.assert_index_equal(idx, sf[idx.values].columns) + + +def test_getitem_empty(): + df = pd.DataFrame([], columns=list('abcdefghij'), dtype=float) + sf = SparseFrame(df) + + assert sf['a'].empty + assert sf['a'].columns.tolist() == ['a'] + assert sf[['a', 'b']].empty + assert sf[['a', 'b']].columns.tolist() == ['a', 'b'] + + +# noinspection PyStatementEffect +def test_getitem_missing_col(): + id_ = np.identity(10) + sf = SparseFrame(id_, columns=list('abcdefghij')) + + with pytest.raises(ValueError): + sf[None] + with pytest.raises(KeyError): + sf['x'] + with pytest.raises(KeyError): + sf[['x']] + with pytest.raises(KeyError): + sf[['a', 'x']] + with pytest.raises(KeyError): + sf[['y', 'x']] + + idx = pd.Index(list('abx')) + with pytest.raises(KeyError): + sf[idx] + with pytest.raises(KeyError): + sf[idx.to_series()] + with pytest.raises(KeyError): + sf[idx.tolist()] + with pytest.raises(KeyError): + sf[tuple(idx)] + with pytest.raises(KeyError): + sf[idx.values] def test_vstack(): @@ -424,12 +666,10 @@ def test_vstack_multi_index(clickstream): df_0 = clickstream.iloc[:len(clickstream) // 2] df_1 = clickstream.iloc[len(clickstream) // 2:] sf_0 = sparse_one_hot(df_0, - categories=list('ABCDE'), - column='page_id', + categories={'page_id': list('ABCDE')}, index_col=['index', 'id']) sf_1 = sparse_one_hot(df_1, - categories=list('ABCDE'), - column='page_id', + categories={'page_id': list('ABCDE')}, index_col=['index', 'id']) res = SparseFrame.vstack([sf_0, sf_1]) assert isinstance(res.index, pd.MultiIndex) @@ -443,36 +683,6 @@ def test_boolean_indexing(): assert res.index.tolist() == [3, 4] -def test_dask_loc(clickstream): - sf = dd.from_pandas(clickstream, npartitions=10) \ - .map_partitions( - sparse_one_hot, - column='page_id', - categories=list('ABCDE'), - meta=list - ) - - res = sf.loc['2016-01-15':'2016-02-15'] - res = SparseFrame.concat(res.compute(get=get_sync).tolist()) - assert res.index.date.max() == dt.date(2016, 2, 15) - assert res.index.date.min() == dt.date(2016, 1, 15) - - -def test_dask_multi_index_loc(clickstream): - sf = dd.from_pandas(clickstream, npartitions=10) \ - .map_partitions( - sparse_one_hot, - column='page_id', - index_col=['index', 'id'], - categories=list('ABCDE'), - meta=list - ) - res = sf.loc['2016-01-15':'2016-02-15'] - res = SparseFrame.vstack(res.compute(get=get_sync).tolist()) - assert res.index.get_level_values(0).date.min() == dt.date(2016, 1, 15) - assert res.index.get_level_values(0).date.max() == dt.date(2016, 2, 15) - - def test_rename(): old_names = list('ABCDE') func = lambda x: x + '_new' @@ -512,17 +722,67 @@ def test_drop_duplicate_idx(): def test_repr(): + sf = SparseFrame(sparse.csr_matrix((2, 3))) + res = sf.__repr__() + assert isinstance(res, str) + assert len(res.splitlines()) == 1 + 2 + 2 # column names + 2 rows + descr. 
+ sf = SparseFrame(sparse.csr_matrix((10, 10000))) res = sf.__repr__() assert isinstance(res, str) assert '10x10000' in res assert '0 stored' in res - sf = SparseFrame(np.array([]), index=[], columns=['A', 'B']) + sf = SparseFrame(sparse.csr_matrix((10000, 10000))) + res = sf.__repr__() + assert isinstance(res, str) + + sf = SparseFrame(np.empty(shape=(0, 2)), index=[], columns=['A', 'B']) + res = sf.__repr__() + assert isinstance(res, str) + + sf = SparseFrame(np.empty(shape=(0, 200)), index=[], + columns=np.arange(200)) res = sf.__repr__() assert isinstance(res, str) +def test_groupby_agg(groupby_frame): + res = groupby_frame.groupby_agg( + level=0, + agg_func=lambda x: x.sum(axis=0) + ).data.todense() + assert np.all(res == (np.identity(10) * 10)) + + res = groupby_frame.groupby_agg( + level=0, + agg_func=lambda x: x.mean(axis=0) + ) + assert np.all(res.data.todense().round() == np.identity(10)) + + assert np.all(res.columns == groupby_frame.columns) + assert np.all(res.index == groupby_frame.index.unique().sort_values()) + + +def test_groupby_agg_multiindex(): + df = pd.DataFrame({'X': [1, 1, 1, 0], + 'Y': [0, 1, 0, 1], + 'gr': ['a', 'a', 'b', 'b'], + 'day': [10, 11, 11, 12]}) + df = df.set_index(['day', 'gr']) + sf = SparseFrame(df) + + correct = df.groupby(level=1).mean() + res = sf.groupby_agg(level=1, agg_func=lambda x: x.mean(axis=0)) + assert np.all(res.index == correct.index) + assert np.all(res.columns == correct.columns) + + correct = df.groupby(by='Y').mean() + res = sf.groupby_agg(by='Y', agg_func=lambda x: x.mean(axis=0)) + assert np.all(res.index == correct.index) + assert np.all(res.columns == correct.columns) + + def test_init_with_pandas(): df = pd.DataFrame(np.identity(5), index=[ @@ -533,7 +793,13 @@ def test_init_with_pandas(): sf = SparseFrame(df) assert sf.shape == (5, 5) assert isinstance(sf.index, pd.MultiIndex) - assert sf.columns.tolist() == list('ABCDE') + assert (sf.index == df.index).all() + assert (sf.columns == df.columns).all() + + with pytest.warns(SyntaxWarning): + sf = SparseFrame(df, index=np.arange(10, 15), columns=list('VWXYZ')) + assert sf.index.tolist() == np.arange(10, 15).tolist() + assert sf.columns.tolist() == list('VWXYZ') s = pd.Series(np.ones(10)) sf = SparseFrame(s) @@ -544,3 +810,205 @@ def test_init_with_pandas(): df['A'] = 'bla' with pytest.raises(TypeError): sf = SparseFrame(df) + + +def test_multiply_rowwise(): + # Row wise multiplication with different types + sf = SparseFrame(np.ones((5, 5))) + other = np.arange(5) + msg = "Row wise multiplication failed" + + # list + res = sf.multiply(list(other), axis=0) + assert np.all(res.sum(axis=1).T == 5 * other), msg + + # 1D array + res = sf.multiply(other, axis=0) + assert np.all(res.sum(axis=1).T == 5 * other), msg + + # 2D array + _other = other.reshape(-1, 1) + res = sf.multiply(_other, axis=0) + assert np.all(res.sum(axis=1).T == 5 * other), msg + + # SparseFrame + _other = SparseFrame(other) + res = sf.multiply(_other, axis=0) + assert np.all(res.sum(axis=1).T == 5 * other), msg + + # csr_matrix + _other = _other.data + res = sf.multiply(_other, axis=0) + assert np.all(res.sum(axis=1).T == 5 * other), msg + + +def test_multiply_colwise(): + # Column wise multiplication with different types + sf = SparseFrame(np.ones((5, 5))) + other = np.arange(5) + msg = "Column wise multiplication failed" + + # list + res = sf.multiply(list(other), axis=1) + assert np.all(res.sum(axis=0) == 5 * other), msg + + # 1D array + res = sf.multiply(other, axis=1) + assert np.all(res.sum(axis=0) == 5 * 
other), msg + + # 2D array + _other = other.reshape(1, -1) + res = sf.multiply(_other, axis=1) + assert np.all(res.sum(axis=0) == 5 * other), msg + + # SparseFrame + _other = SparseFrame(other) + res = sf.multiply(_other, axis=1) + assert np.all(res.sum(axis=0) == 5 * other), msg + + # csr_matrix + _other = _other.data + _other.toarray() + res = sf.multiply(_other, axis=1) + assert np.all(res.sum(axis=0) == 5 * other), msg + + +def test_multiply_wrong_axis(): + sf = SparseFrame(np.ones((5, 5))) + other = np.arange(5) + + with pytest.raises(ValueError): + sf.multiply(other, axis=2) + + +def test_drop_single_label(): + old_names = list('ABCDE') + sf = SparseFrame(np.identity(5), columns=old_names) + sf = sf.drop('A', axis=1) + + correct = np.identity(5)[:, 1:] + assert sf.columns.tolist() == list('BCDE') + np.testing.assert_array_equal(sf.data.todense(), correct) + + +def test_drop_non_existing_label(): + old_names = list('ABCDE') + sf = SparseFrame(np.identity(5), columns=old_names) + sf = sf.drop('Z', axis=1) + + +def test_drop_multiple_labels(): + old_names = list('ABCDE') + sf = SparseFrame(np.identity(5), columns=old_names) + sf = sf.drop(['A', 'C'], axis=1) + + correct = np.identity(5)[:, [1, 3, 4]] + assert sf.columns.tolist() == list('BDE') + np.testing.assert_array_equal(sf.data.todense(), correct) + + +def test_label_based_indexing_col(sample_frame_labels): + key = ['A', 'B'] + results = [ + sample_frame_labels[key], + sample_frame_labels.loc[:, key], + sample_frame_labels.reindex(columns=key) + ] + for res in results: + np.testing.assert_array_equal( + res.data.todense(), np.identity(5)[:, :2]) + assert (res.index == pd.Index(list('VWXYZ'))).all() + assert (res.columns == pd.Index(list('AB'))).all() + + +def test_label_based_indexing_idx(sample_frame_labels): + key = ['X', 'Y', 'Z'] + results = [ + sample_frame_labels.loc[key], + sample_frame_labels.loc[key, :], + sample_frame_labels.reindex(labels=key, axis=0), + sample_frame_labels.reindex(index=key) + ] + for res in results: + np.testing.assert_array_equal( + res.data.todense(), np.identity(5)[2:, :]) + assert (res.index == pd.Index(['X', 'Y', 'Z'])).all() + assert (res.columns == pd.Index(list('ABCDE'))).all() + + +def test_label_based_col_and_idx(sample_frame_labels): + key = ['V', 'W'], ['A', 'B'] + results = [ + sample_frame_labels.loc[key], + sample_frame_labels.loc[['V', 'W'], ['A', 'B']], + sample_frame_labels.reindex(index=key[0], columns=key[1]) + ] + for res in results: + np.testing.assert_array_equal( + res.data.todense(), np.identity(2)) + assert (res.index == pd.Index(list('VW'))).all() + assert (res.columns == pd.Index(list('AB'))).all() + + +def test_indexing_boolean_label_col_and_idx(sample_frame_labels): + res = sample_frame_labels.loc[[True, True, False, False, False], ['A', 'B']] + np.testing.assert_array_equal( + res.data.todense(), np.identity(2)) + assert (res.index == pd.Index(list('VW'))).all() + assert (res.columns == pd.Index(list('AB'))).all() + + res = sample_frame_labels.loc[['V', 'W'], [True, True, False, False, False]] + np.testing.assert_array_equal( + res.data.todense(), np.identity(2)) + assert (res.index == pd.Index(list('VW'))).all() + assert (res.columns == pd.Index(list('AB'))).all() + + +def test_error_reindex_duplicate_axis(): + sf = SparseFrame(np.identity(5), + columns = list('ABCDE'), + index = list('UUXYZ')) + with pytest.raises(ValueError): + sf.reindex(['U', 'V']) + + +def test_empty_elemwise(): + sf_empty = SparseFrame(np.array([]), columns=['A', 'B']) + sf = 
SparseFrame(np.identity(2), columns=['A', 'B']) + + res = sf_empty.add(sf).data.todense() + assert np.all(res == sf.data.todense()) + + res = sf.add(sf_empty).data.todense() + assert np.all(res == sf.data.todense()) + + with pytest.raises(ValueError): + res = sf.add(sf_empty, fill_value=None) + + +def test_loc_duplicate_index(): + sf = SparseFrame(np.identity(5), + columns=list('UUXYZ'), + index=list('AAABB')) + assert len(sf.loc['A'].index) == 3 + assert len(sf.loc['B'].index) == 2 + assert np.all(sf.loc['A'].todense().values == np.identity(5)[:3]) + assert np.all(sf.loc['B'].todense().values == np.identity(5)[3:]) + + assert len(sf.loc[:, 'U'].columns) == 2 + assert np.all(sf.loc[:, 'U'].todense().values == np.identity(5)[:, :2]) + + +def test_error_unaligned_indices(): + data = np.identity(5) + with pytest.raises(ValueError) as e: + SparseFrame(data, index=np.arange(6)) + assert '(5, 5)' in str(e) and '(6, 5)' in str(e) + + with pytest.raises(ValueError) as e: + SparseFrame(data, columns=np.arange(6)) + assert '(5, 5)' in str(e) and '(5, 6)' in str(e) + + with pytest.raises(ValueError) as e: + SparseFrame(data, columns=np.arange(6), index=np.arange(6)) + assert '(5, 5)' in str(e) and '(6, 6)' in str(e) diff --git a/sparsity/test/tiny.tdb b/sparsity/test/tiny.tdb deleted file mode 100644 index a15a0ab..0000000 Binary files a/sparsity/test/tiny.tdb and /dev/null differ diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..64fea1c --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1822 @@ + +# Version: 0.18 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. 
Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. 
Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/warner/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other langauges) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string. However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking +this issue. 
The discussion in +[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + +### Unicode version strings + +While Versioneer works (and is continually tested) with both Python 2 and +Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. +Newer releases probably generate unicode version strings on py2. It's not +clear that this is wrong, but it may be surprising for applications when then +write these strings to a network connection or include them in bytes-oriented +APIs like cryptographic checksums. + +[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates +this question. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain. 
+Specifically, both are released under the Creative Commons "Public Domain +Dedication" license (CC0-1.0), as described in +https://creativecommons.org/publicdomain/zero/1.0/ . + +""" + +from __future__ import print_function +try: + import configparser +except ImportError: + import ConfigParser as configparser +import errno +import json +import os +import re +import subprocess +import sys + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_root(): + """Get the project root directory. + + We require that all commands are run from the project root, i.e. the + directory that contains setup.py, setup.cfg, and versioneer.py . + """ + root = os.path.realpath(os.path.abspath(os.getcwd())) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + # allow 'python path/to/setup.py COMMAND' + root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) + setup_py = os.path.join(root, "setup.py") + versioneer_py = os.path.join(root, "versioneer.py") + if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): + err = ("Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND').") + raise VersioneerBadRootError(err) + try: + # Certain runtime workflows (setup.py install/develop in a setuptools + # tree) execute all dependencies in a single python process, so + # "versioneer" may be imported multiple times, and python's shared + # module-import table will cache the first one. So we can't use + # os.path.dirname(__file__), as that will find whichever + # versioneer.py was first imported, even in later projects. + me = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(me)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . 
+ setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode + + +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.18 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, p.returncode + return stdout, p.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], + cwd=root)[0].strip() + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %s but none started with prefix %s" % + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.18) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += plus_or_dot(pieces)
+            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
+                                          pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def render_pep440_pre(pieces):
+    """TAG[.post.devDISTANCE] -- No -dirty.
+
+    Exceptions:
+    1: no tags. 0.post.devDISTANCE
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += ".post.dev%d" % pieces["distance"]
+    else:
+        # exception #1
+        rendered = "0.post.dev%d" % pieces["distance"]
+    return rendered
+
+
+def render_pep440_post(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    The ".dev0" means dirty. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyway.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+    return rendered
+
+
+def render_pep440_old(pieces):
+    """TAG[.postDISTANCE[.dev0]] .
+
+    The ".dev0" means dirty.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty] (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces):
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version", + "date": None} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? + + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. + # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if 'py2exe' in sys.modules: # py2exe enabled? 
+ try: + from py2exe.distutils_buildexe import py2exe as _py2exe # py3 + except ImportError: + from py2exe.build_exe import py2exe as _py2exe # py2 + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)
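For reference, the `render_*` helpers above are what turn the result of `git describe` into the version string that ends up in the package. The sketch below is illustrative only: the `pieces` dict is hypothetical (invented tag, distance, revision id and date), and it simply exercises `render()` from the vendored file, assuming versioneer.py is importable (e.g. running from the repository root).

```python
# Illustrative sketch -- the tag, distance, hash and date below are invented.
from versioneer import render

pieces = {
    "closest-tag": "0.5.0",           # nearest tag, with tag_prefix stripped
    "distance": 3,                    # commits since that tag
    "short": "abc1234",               # short revision id
    "long": "abc1234" + "0" * 33,     # full revision id (dummy value)
    "dirty": True,                    # working tree has local modifications
    "error": None,
    "date": "2018-11-20T12:00:00+0000",
}

print(render(pieces, "pep440")["version"])        # 0.5.0+3.gabc1234.dirty
print(render(pieces, "pep440-post")["version"])   # 0.5.0.post3.dev0+gabc1234
print(render(pieces, "git-describe")["version"])  # 0.5.0-3-gabc1234-dirty
```

A tagged, clean checkout (distance 0, dirty False) renders as the bare tag under the default pep440 style, which is why builds made exactly from a release tag get clean version numbers.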
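The module only takes effect once it is wired into the packaging, following the recipe spelled out in CONFIG_ERROR, SAMPLE_CONFIG and INIT_PY_SNIPPET above. Below is a minimal setup.py sketch of that hookup; it reuses the placeholder name `myproject` from the sample config, so the real package name, version file path and tag prefix for this repository are assumptions to be substituted.

```python
# setup.py -- minimal sketch of the hookup described in CONFIG_ERROR above.
# "myproject" is the placeholder from the sample config, not this repo's real name.
from setuptools import setup, find_packages

import versioneer  # the vendored versioneer.py added by this diff

setup(
    name="myproject",
    version=versioneer.get_version(),    # keywords, _version.py, git describe or parentdir
    cmdclass=versioneer.get_cmdclass(),  # wraps build_py/sdist so artifacts ship a frozen _version.py
    packages=find_packages(),
)
```

With the matching `[versioneer]` section in setup.cfg, `python versioneer.py setup` (the `__main__` block above) writes `_version.py`, appends the INIT_PY_SNIPPET to the package `__init__.py`, updates MANIFEST.in and .gitattributes, and `scan_setup_py()` then verifies that setup.py imports versioneer and uses `get_version()`/`get_cmdclass()` as shown here.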