Update deps #14

Open · wants to merge 87 commits into base: master

Commits (87)
45376b4
add groupby_agg method
kayibal Apr 17, 2017
4c3eec2
read version from a single location
kayibal Apr 17, 2017
eb745aa
Add package_data to include VERSION file in distribution
kayibal Apr 18, 2017
e339052
Fix package data values must be iterables
kayibal Apr 18, 2017
60e83d8
Add installation test to sparsity
kayibal Apr 18, 2017
3f54e0d
Added test_assign_column for dask.SparseFrame
Talmaj Apr 21, 2017
176228b
Added working assign column for dask.SparseFrame
Talmaj Apr 21, 2017
1da4035
Increase version number:
kayibal Apr 24, 2017
f4602d0
Added property columns and index to dask.SparseFrame
Talmaj Apr 24, 2017
9e33ab3
Increase version, add *.egg-info to .gitignore
Talmaj Apr 25, 2017
e8838cf
Merge pull request #6 from datarevenue-berlin/feature/meta
Talmaj Apr 25, 2017
c776436
Add multiply method to SparseFrame and its test.
Talmaj Apr 25, 2017
98007fd
Fixed ci on a branch.
Talmaj Apr 27, 2017
77ceb6b
Change toarray() for 1 dimensional SparseFrames, add tests for multip…
Talmaj Apr 27, 2017
9b4740b
Change back to_array functionality and mimic pd.DataFrame.multiply me…
Talmaj May 2, 2017
2a701bb
bump version
kayibal May 3, 2017
21dfe7c
Hotfix pandas-0.20.0 breaks backwards compatibility with internal mod…
kayibal May 5, 2017
69a48b5
adds support to pandas>=0.20
kayibal May 5, 2017
a4bba4a
Adjust pandas dependency on setup.py
kayibal May 5, 2017
71dbd6b
Hotfix: Accept kwargs in SparseFrame.add for congruency with pandas.D…
kayibal May 18, 2017
f8fc149
Version 0.9.3
kayibal May 18, 2017
ba89a8d
Merge pull request #8 from datarevenue-berlin/hotfix/add-signature
May 18, 2017
fe0ff68
Added fillna method
michcio1234 May 18, 2017
e83b1e8
Version bump to 0.10.0
michcio1234 May 18, 2017
69f2aa1
Merge pull request #9 from datarevenue-berlin/feature/fillna
michcio1234 May 18, 2017
8763fef
Add reading categories from path
kayibal Jun 16, 2017
54bd778
Bump version to 0.11.0 and remove test from setup.py packages
kayibal Jun 19, 2017
e2408fa
Fix CodeCov reports on project coverage
kayibal Jun 19, 2017
1f4f5e9
Read categories from path
Jun 19, 2017
ed6ae45
Fix multiindex loc indexer (#11)
Jul 10, 2017
1c10bed
Fix to_npz (#12)
Jul 11, 2017
9d09f2f
Fix/empty attribute (#13)
Jul 11, 2017
5234740
Bump version to 0.11.1
kayibal Jul 11, 2017
4acf016
One-hot encode multiple columns (#16)
michcio1234 Jul 21, 2017
e3a09e6
Added possibility to drop single or multiple columns.
michcio1234 Jul 21, 2017
9b8ffc2
Remove unused code.
michcio1234 Jul 21, 2017
9ee76a6
Use versioneer for dynamic version strings (#17)
Jul 21, 2017
1774cc6
Merge pull request #18 from datarevenue-berlin/feature/drop-columns
michcio1234 Jul 21, 2017
5b1d05f
Add support to save npz files on s3 (#15)
Jul 25, 2017
a5a713b
Bugfix: one-hot encoding column of category dtype
michcio1234 Oct 25, 2017
a62753d
It's possible to raise an exception or to ignore the situation when g…
michcio1234 Nov 2, 2017
0e0ec76
add private take method to support indexing in pandas 0.21.0 (#20)
Nov 6, 2017
5c3ceb3
Cleaner code, better documentation and more tests.
michcio1234 Nov 8, 2017
05867f6
One-hot-encode categorical column (#22)
michcio1234 Nov 9, 2017
060d09f
Implement multipart upload with default block_size=100MB (#23)
Nov 14, 2017
660bbb0
support empty frames in elementwise operations (#21)
Nov 28, 2017
e61fa5f
add support for arbitrary remote storages (#24)
Nov 28, 2017
8fafddc
Support list like label based indexing (#27)
Nov 30, 2017
d6d30b8
Add support for new dask custom collection interface. (#29)
Dec 22, 2017
f10254f
Getitem and loc failed when all labels were requested. (#28)
michcio1234 Dec 27, 2017
01328de
Elementwise comparison for arrays with different length is deprecated…
michcio1234 Jan 4, 2018
c0c7671
Distributed join (#34)
Feb 15, 2018
6de787c
add from_ddf method (#32)
Feb 17, 2018
87f9928
Distributed groupby sum operation (#35)
Mar 19, 2018
4d85502
Sort index (#37)
Apr 19, 2018
9352ea3
Optimization of distributed procedures (#38)
Apr 19, 2018
401c4c6
Set index (#36)
Apr 20, 2018
b6a5938
To npz (#39)
Apr 20, 2018
f29af39
update dask imports and __dask__keys usage (#40)
Apr 20, 2018
4d5fd2b
Drop support for pandas>=0.23.0 as api changes break iloc functionali…
Jun 1, 2018
8d2f8f6
Update indexer instantiation. Allow loc from index with duplicates. (…
Aug 22, 2018
ce1ac3a
Accept pathlib objects in io module. (#48)
michcio1234 Aug 28, 2018
4c09026
Raise error when initialising with unaligned indices (#51)
Sep 4, 2018
011fd3e
Fix __repr__ (#60)
michcio1234 Sep 5, 2018
eb777fd
Fix joining with axis=0 with different columns (#57)
michcio1234 Sep 6, 2018
c042db9
Fix init from pd.DataFrame with passed index/columns (#61)
michcio1234 Sep 6, 2018
f3cd306
Removed unused code (#62)
michcio1234 Sep 6, 2018
2c2cdd7
Swap behaviour for axis=0/1 in .multiply (#63)
michcio1234 Sep 6, 2018
20e3bc4
Better index/columns handling in groupby operations (#64)
michcio1234 Sep 7, 2018
b52a270
Remove traildb (#41)
Sep 7, 2018
96e57f1
Sphinx doc (#47)
Sep 7, 2018
3d10dc8
Require pandas not higher than 0.23.4
michcio1234 Sep 7, 2018
fe33ab0
Add BSD 3-clause license
michcio1234 Sep 7, 2018
87ab3d5
Pypi (#65)
michcio1234 Sep 7, 2018
0972823
Add Google Analytics ID (#66)
Sep 10, 2018
43d1a44
Compatibility with dask version 0.19.3 (#70)
Oct 10, 2018
4751444
Refactor/binning (#69)
Dec 4, 2018
b27df42
Support getitem with Index (#75)
michcio1234 Jan 19, 2019
23f091b
Rename io modules to io_ and fix some version conflicts (#78)
Jun 4, 2019
6736452
Add support for dask persist (#77)
Jun 5, 2019
e8fa03f
DaskSparseFrame getitem, todense and bugfix (#79)
michcio1234 Jun 14, 2019
86bf9df
Add test for local version (#83)
michcio1234 Jul 12, 2019
816afae
Bugfix/81 get missing column (#82)
michcio1234 Jul 12, 2019
73c690f
Support latest pandas version.
kayibal Aug 5, 2019
49e1d71
Fix some test failures that occurred with latest dask & co versions
kayibal Aug 5, 2019
200a554
Add alias to ensure_index import
michcio1234 Aug 6, 2019
25fad8f
FIXME: remove raise_missing=True
michcio1234 Aug 6, 2019
12 changes: 12 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,12 @@
version: 2
jobs:
build:
working_directory: ~/sparsity
docker:
- image: drtools/dask:latest
steps:
- checkout
- run: pip install boto3==1.7.84 botocore==1.10.84 moto==1.3.6
- run: pip install pytest pytest-cov dask==1.0.0 .
- run: py.test --cov sparsity --cov-report xml sparsity
- run: bash <(curl -s https://codecov.io/bash)
2 changes: 1 addition & 1 deletion .coveragerc
@@ -1,2 +1,2 @@
[run]
omit = sparsity/test/*, */__init__.py
omit = sparsity/test/*, */__init__.py, */_version.py
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
sparsity/_version.py export-subst
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ build/
*.so
traildb_sparse.c
__pycache__
*.egg-info
*.npz
24 changes: 24 additions & 0 deletions LICENSE
@@ -0,0 +1,24 @@
Copyright (c) 2018, Data Revenue
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
include versioneer.py
include sparsity/_version.py
8 changes: 0 additions & 8 deletions Makefile

This file was deleted.

117 changes: 6 additions & 111 deletions README.md
@@ -2,121 +2,16 @@
[![CircleCI](https://circleci.com/gh/datarevenue-berlin/sparsity.svg?style=svg)](https://circleci.com/gh/datarevenue-berlin/sparsity)
[![Codecov](https://img.shields.io/codecov/c/github/datarevenue-berlin/sparsity.svg)](https://codecov.io/gh/datarevenue-berlin/sparsity)

Sparse data processing toolbox. It builds on top of pandas and scipy to provide a DataFrame-like
API to work with sparse categorical data.

It also provides an extremely fast C-level interface to read from traildb databases. This makes
it a highly performant package for data-processing jobs such as log processing, clickstream,
or click-through data.
Sparse data processing toolbox. It builds on top of pandas and scipy to provide a
DataFrame-like API to work with sparse data.

In combination with dask, it provides support for executing complex operations on a
concurrent/distributed level.

## Attention
**Not ready for production**

# Motivation
Many tasks, especially in the data analytics and machine learning domains, make use of sparse
data structures to support the input of high-dimensional data.

This project was started to build an efficient, homogeneous sparse data processing pipeline.
As of today, dask has no support for anything like a sparse dataframe. We process large
amounts of high-dimensional data on a daily basis at [datarevenue](http://datarevenue.com),
and our favourite language and ETL framework are Python and dask. After chaining many
function calls on scipy.sparse csr matrices that involved handling indices and column names
to produce a sparse data pipeline, I decided to start this project.

This package might be especially useful to you if you have very large amounts of sparse data,
such as clickstream data, categorical time series, or log data.

# Traildb access?
[Traildb](http://traildb.io/) is an amazing log-style database, released recently by AdRoll.
It compresses event-like data extremely efficiently. Furthermore, it provides a fast C-level
API to query it.

Traildb also has Python bindings, but you still might need to iterate over many millions of
users or trails (or both), which carries considerable overhead in Python. Therefore, sparsity
provides high-speed access to the database in the form of SparseFrame objects. These are fast,
efficient, and intuitive enough to do further processing on.

*At the moment, uuid and timestamp information is lost, but it will be provided as a
pandas.MultiIndex handled by the SparseFrame in a (very soon) future release.*

````
In [1]: from sparsity import SparseFrame

In [2]: sdf = SparseFrame.read_traildb('pydata.tdb', field="title")

In [3]: sdf.head()
Out[3]:
0 1 2 3 4 ... 37388 37389 37390 37391 37392
0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
1 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
2 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
3 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0
4 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0

[5 rows x 37393 columns]

In [6]: %%timeit
...: sdf = SparseFrame.read_traildb("/Users/kayibal/Code/traildb_to_sparse/traildb_to_sparse/traildb_to_sparse/sparsity/test/pydata.tdb", field="title")
...:
10 loops, best of 3: 73.8 ms per loop

In [4]: sdf.shape
Out[4]: (109626, 37393)
````

# But wait, pandas has SparseDataFrames and SparseSeries
Pandas has its own implementation of sparse data structures. Unfortunately, these structures
perform quite badly with a groupby-sum aggregation, which we also use often. Furthermore,
doing a groupby on a pandas SparseDataFrame returns a dense DataFrame. This makes chaining
many groupby operations over multiple files cumbersome and less efficient. Consider the
following example:

```
In [1]: import sparsity
...: import pandas as pd
...: import numpy as np
...:

In [2]: data = np.random.random(size=(1000,10))
...: data[data < 0.95] = 0
...: uids = np.random.randint(0,100,1000)
...: combined_data = np.hstack([uids.reshape(-1,1),data])
...: columns = ['id'] + list(map(str, range(10)))
...:
...: sdf = pd.SparseDataFrame(combined_data, columns = columns, default_fill_value=0)
...:

In [3]: %%timeit
...: sdf.groupby('id').sum()
...:
1 loop, best of 3: 462 ms per loop

In [4]: res = sdf.groupby('id').sum()
...: res.values.nbytes
...:
Out[4]: 7920

In [5]: data = np.random.random(size=(1000,10))
...: data[data < 0.95] = 0
...: uids = np.random.randint(0,100,1000)
...: sdf = sparsity.SparseFrame(data, columns=np.asarray(list(map(str, range(10)))), index=uids)
...:

In [6]: %%timeit
...: sdf.groupby_sum()
...:
The slowest run took 4.20 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 1.25 ms per loop

In [7]: res = sdf.groupby_sum()
...: res.__sizeof__()
...:
Out[7]: 6128
```

I'm not quite sure if there is some cached result, but I don't think so. This only uses a
smart csr matrix multiplication to do the operation.

More information and examples can be found in the [documentation](https://sparsity.readthedocs.io).

## Installation

```
$ pip install sparsity
```
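
The "smart csr matrix multiplication" mentioned above can be made concrete. Here is a minimal
sketch of the trick, assuming nothing about sparsity's internals (the helper name and structure
are ours, not the library's): build a one-hot grouping matrix `G` of shape (n_groups, n_rows)
so that the whole groupby-sum collapses into a single sparse matrix product `G.dot(X)`.

```
import numpy as np
import scipy.sparse as sp

def groupby_sum_via_matmul(data, labels):
    """Group rows of a sparse matrix by label and sum them in one matmul."""
    groups, inverse = np.unique(labels, return_inverse=True)
    n_rows = data.shape[0]
    # one_hot[g, i] == 1 exactly where row i belongs to group g.
    one_hot = sp.csr_matrix(
        (np.ones(n_rows), (inverse, np.arange(n_rows))),
        shape=(len(groups), n_rows),
    )
    return groups, one_hot.dot(data)  # sparse result, one row per group

data = sp.random(1000, 10, density=0.05, format='csr')
uids = np.random.randint(0, 100, 1000)
keys, sums = groupby_sum_via_matmul(data, uids)
print(keys.shape, sums.shape)  # e.g. (100,) and (100, 10)
```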
20 changes: 0 additions & 20 deletions circle.yml

This file was deleted.

155 changes: 155 additions & 0 deletions docs/Makefile
@@ -0,0 +1,155 @@
# Makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build

# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"

clean:
-rm -rf $(BUILDDIR)/*

html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

apidoc:
sphinx-apidoc -fME -o api ../sparsity
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."

json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."

htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sparsity.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sparsity.qhc"

devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/sparsity"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sparsity"
@echo "# devhelp"

epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."

latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."

info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
24 changes: 24 additions & 0 deletions docs/api/dask-sparseframe-api.rst
@@ -0,0 +1,24 @@
Dask SparseFrame API
====================

.. py:currentmodule:: sparsity.dask.core

.. autosummary::
SparseFrame
SparseFrame.assign
SparseFrame.compute
SparseFrame.columns
SparseFrame.get_partition
SparseFrame.index
SparseFrame.join
SparseFrame.known_divisions
SparseFrame.map_partitions
SparseFrame.npartitions
SparseFrame.persist
SparseFrame.repartition
SparseFrame.set_index
SparseFrame.rename
SparseFrame.sort_index
SparseFrame.to_delayed
SparseFrame.to_npz
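
For orientation, a hedged usage sketch of this lazy collection interface. Only the method
names come from the summary above; the import alias and the `to_npz` path semantics are
assumptions, not confirmed by this page, which is why the sketch takes an already-constructed
frame as an argument rather than guessing at a constructor.

```
import sparsity.dask.core as sdc  # module path from the currentmodule directive above

def sort_persist_save(dsf: "sdc.SparseFrame", path: str):
    """Drive a dask-backed SparseFrame using only the methods listed above."""
    print(dsf.npartitions, dsf.known_divisions)  # partition metadata
    dsf = dsf.sort_index().persist()   # sort lazily, then keep results in memory
    print(dsf.get_partition(0))        # inspect a single-partition SparseFrame
    dsf.to_npz(path)                   # write npz output; path semantics assumed
    return dsf.compute()               # materialize as an in-memory SparseFrame
```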