From 040d538a5b5e5ead8f18c6356080aedb84d37bc8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 1 May 2019 13:19:34 -0700 Subject: [PATCH 1/2] Better string representation of cudf.MultiIndex object --- python/cudf/dataframe/multiindex.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/dataframe/multiindex.py b/python/cudf/dataframe/multiindex.py index fe93bed6a9d..28859e628bb 100644 --- a/python/cudf/dataframe/multiindex.py +++ b/python/cudf/dataframe/multiindex.py @@ -101,8 +101,10 @@ def _popn(self, n): return result def __repr__(self): + codes_repr = self.codes.as_matrix().T.tolist() return "MultiIndex(levels=" + str(self.levels) +\ - ",\ncodes=" + str(self.codes) + ")" + ",\n" +\ + " codes=" + str(codes_repr) + ")" @property def labels(self): From 9a11d5b690332c380fabbfdbf48156477c6e04bc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 1 May 2019 15:02:39 -0700 Subject: [PATCH 2/2] Add MultiIndex to 10min --- docs/source/10min.ipynb | 645 ++++++++++++++++++++++++---------------- 1 file changed, 381 insertions(+), 264 deletions(-) diff --git a/docs/source/10min.ipynb b/docs/source/10min.ipynb index 7470a2ce55e..f7783411b85 100644 --- a/docs/source/10min.ipynb +++ b/docs/source/10min.ipynb @@ -13,9 +13,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -48,20 +46,18 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", "0 1\n", "1 2\n", "2 3\n", "3 \n", - "4 4\n" + "4 4\n", + "dtype: int64\n" ] } ], @@ -80,25 +76,23 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c\n", - " 0 0 19 0\n", - " 1 1 18 1\n", - " 2 2 17 2\n", - " 3 3 16 3\n", - " 4 4 15 4\n", - " 5 5 14 5\n", - " 6 6 13 6\n", - " 7 7 12 7\n", - " 8 8 11 8\n", - " 9 9 10 9\n", + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3\n", + "4 4 15 4\n", + "5 5 14 5\n", + "6 6 13 6\n", + "7 7 12 7\n", + "8 8 11 8\n", + "9 9 10 9\n", "[10 more rows]\n" ] } @@ -120,19 +114,17 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b\n", - "0 0 0.1\n", - "1 1 0.2\n", - "2 2 \n", - "3 3 0.3\n" + " a b\n", + "0 0 0.1\n", + "1 1 0.2\n", + "2 2 \n", + "3 3 0.3\n" ] } ], @@ -160,17 +152,15 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c\n", - "0 0 19 0\n", - "1 1 18 1\n" + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n" ] } ], @@ -188,25 +178,23 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c\n", - "19 19 0 19\n", - "18 18 1 18\n", - "17 17 2 17\n", - "16 16 3 16\n", - "15 15 4 15\n", - "14 14 5 14\n", - "13 13 6 13\n", - "12 12 7 12\n", - "11 11 8 11\n", - "10 10 9 10\n", + " a b c\n", + "19 19 0 19\n", + "18 18 1 18\n", + "17 17 2 17\n", + "16 16 3 16\n", + "15 15 4 15\n", + "14 14 5 14\n", + "13 13 6 13\n", + "12 12 7 12\n", + "11 11 8 11\n", + "10 10 9 10\n", "[10 more rows]\n" ] } @@ -235,26 +223,24 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - " 0 0\n", - " 1 1\n", - " 2 2\n", - " 3 3\n", - " 4 4\n", - " 5 5\n", - " 6 6\n", - " 7 7\n", - " 8 8\n", - " 9 9\n", - "[10 more rows]\n" + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4\n", + "5 5\n", + "6 6\n", + "7 7\n", + "8 8\n", + "9 9\n", + "[10 more rows]\n", + "Name: a, dtype: int64\n" ] } ], @@ -279,19 +265,17 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b\n", - "2 2 17\n", - "3 3 16\n", - "4 4 15\n", - "5 5 14\n" + " a b\n", + "2 2 17\n", + "3 3 16\n", + "4 4 15\n", + "5 5 14\n" ] } ], @@ -316,17 +300,15 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c\n", - "3 3 16 3\n", - "4 4 15 4\n" + " a b c\n", + "3 3 16 3\n", + "4 4 15 4\n" ] } ], @@ -344,9 +326,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -377,19 +357,17 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - "0 19\n", - "1 18\n", - "2 17\n", - "3 16\n" + "0 19\n", + "1 18\n", + "2 17\n", + "3 16\n", + "Name: b, dtype: int64\n" ] } ], @@ -407,16 +385,14 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c\n", - "16 16 3 16\n" + " a b c\n", + "16 16 3 16\n" ] } ], @@ -456,20 +432,18 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - "0 1\n", - "1 2\n", - "2 3\n", - "3 999\n", - "4 4\n" + "0 1\n", + "1 2\n", + "2 3\n", + "3 999\n", + "4 4\n", + "dtype: int64\n" ] } ], @@ -502,9 +476,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -535,26 +507,24 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - " 0 10\n", - " 1 11\n", - " 2 12\n", - " 3 13\n", - " 4 14\n", - " 5 15\n", - " 6 16\n", - " 7 17\n", - " 8 18\n", - " 9 19\n", - "[10 more rows]\n" + "0 10\n", + "1 11\n", + "2 12\n", + "3 13\n", + "4 14\n", + "5 15\n", + "6 16\n", + "7 17\n", + "8 18\n", + "9 19\n", + "[10 more rows]\n", + "Name: a, dtype: int64\n" ] } ], @@ -582,26 +552,24 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - " 0 1\n", - " 1 1\n", - " 2 1\n", - " 3 1\n", - " 4 1\n", - " 5 1\n", - " 6 1\n", - " 7 1\n", - " 8 1\n", - " 9 1\n", - "[10 more rows]\n" + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + "5 1\n", + "6 1\n", + "7 1\n", + "8 1\n", + "9 1\n", + "[10 more rows]\n", + "dtype: int64\n" ] } ], @@ -640,9 +608,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 4\n", + "0 1\n", + "1 2\n", + "2 3\n", + "3 \n", + "4 4\n", + "dtype: int64\n", + " a b c\n", + "0 0 19 0\n", + "1 1 18 1\n", + "2 2 17 2\n", + "3 3 16 3\n", + "4 4 15 4\n", + "5 0 19 0\n", + "6 1 18 1\n", + "7 2 17 2\n", + "8 3 16 3\n", + "9 4 15 4\n" + ] + } + ], "source": [ "print(cudf.concat([s, s]))\n", "print(cudf.concat([df.head(), df.head()], ignore_index=True))" @@ -664,24 +661,19 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, + "execution_count": 18, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1:float64\n", - "2:int64\n", - "3:float64\n", - " key vals_a vals_b\n", - "3 0 10.0 \n", - "0 1 11.0 10.0\n", - "1 2 12.0 11.0\n", - "4 3 13.0 \n", - "2 4 14.0 12.0\n" + " key vals_a vals_b\n", + "3 0 10.0 \n", + "0 1 11.0 10.0\n", + "1 2 12.0 11.0\n", + "4 3 13.0 \n", + "2 4 14.0 12.0\n" ] } ], @@ -714,26 +706,24 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, + "execution_count": 19, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", - " 0 0\n", - " 1 1\n", - " 2 2\n", - " 3 3\n", - " 4 4\n", - " 5 19\n", - " 6 18\n", - " 7 17\n", - " 8 16\n", - " 9 15\n" + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4\n", + "5 19\n", + "6 18\n", + "7 17\n", + "8 16\n", + "9 15\n", + "dtype: int64\n" ] } ], @@ -757,10 +747,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "df['agg_col1'] = [1 if x % 2 == 0 else 0 for x in range(len(df))]\n", @@ -776,18 +764,17 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " agg_col1 sum_a sum_b sum_c sum_agg_col2\n", - "0 0 100 90 100 3\n", - "1 1 90 100 90 4\n" + " a b c agg_col2\n", + "agg_col1\n", + "0 100 90 100 3\n", + "1 90 100 90 4\n" ] } ], @@ -804,25 +791,31 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true - }, + "execution_count": 22, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " agg_col1 agg_col2 sum_a sum_b sum_c\n", - "0 0 0 73 60 73\n", - "1 0 1 27 30 27\n", - "2 1 0 54 60 54\n", - "3 1 1 36 40 36\n" + " a b c\n", + "agg_col1 agg_col2 \n", + "0 0 73 60 73\n", + " 1 27 30 27\n", + "1 0 54 60 54\n", + " 1 36 40 36\n" ] } ], "source": [ - "print(df.groupby(['agg_col1', 'agg_col2']).sum())" + "print(df.groupby(['agg_col1', 'agg_col2']).sum().to_pandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Printing `DataFrame`s containing a `MultiIndex` (see below for details) is not supported. This is why we converted to a Pandas DataFrame before printing." ] }, { @@ -834,18 +827,17 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true - }, + "execution_count": 23, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " agg_col1 mean_b sum_c max_a\n", - "0 0 9 100 19\n", - "1 1 10 90 18\n" + " a b c\n", + "agg_col1\n", + "0 19 9.0 100\n", + "1 18 10.0 90\n" ] } ], @@ -878,20 +870,18 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " date value\n", - "0 2018-11-20T00:00:00.000 0.15416284237967237\n", - "1 2018-11-21T00:00:00.000 0.7400496965154048\n", - "2 2018-11-22T00:00:00.000 0.26331501518513467\n", - "3 2018-11-23T00:00:00.000 0.5337393933802977\n" + " date value\n", + "0 2018-11-20T00:00:00.000 0.15416284237967237\n", + "1 2018-11-21T00:00:00.000 0.7400496965154048\n", + "2 2018-11-22T00:00:00.000 0.26331501518513467\n", + "3 2018-11-23T00:00:00.000 0.5337393933802977\n" ] } ], @@ -923,22 +913,20 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": true - }, + "execution_count": 25, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " grade id\n", - "0 a 1\n", - "1 b 2\n", - "2 b 3\n", - "3 a 4\n", - "4 a 5\n", - "5 e 6\n" + " id grade\n", + "0 1 a\n", + "1 2 b\n", + "2 3 b\n", + "3 4 a\n", + "4 5 a\n", + "5 6 e\n" ] } ], @@ -959,10 +947,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -985,22 +971,20 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " \n", "0 0\n", "1 1\n", "2 1\n", "3 0\n", "4 0\n", - "5 2\n" + "5 2\n", + "dtype: int8\n" ] } ], @@ -1040,10 +1024,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1078,10 +1060,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1110,10 +1090,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": true - }, + "execution_count": 30, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1143,22 +1121,38 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": true - }, + "execution_count": 31, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pyarrow.Table\n", - "None: int64\n", "a: int64\n", "b: int64\n", "c: int64\n", "agg_col1: int64\n", - "agg_col2: int64\n" + "agg_col2: int64\n", + "__index_level_0__: int64\n", + "metadata\n", + "--------\n", + "OrderedDict([(b'pandas',\n", + " b'{\"index_columns\": [\"__index_level_0__\"], \"column_indexes\": ['\n", + " b'{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\",'\n", + " b' \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}]'\n", + " b', \"columns\": [{\"name\": \"a\", \"field_name\": \"a\", \"pandas_type\"'\n", + " b': \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"name\"'\n", + " b': \"b\", \"field_name\": \"b\", \"pandas_type\": \"int64\", \"numpy_typ'\n", + " b'e\": \"int64\", \"metadata\": null}, {\"name\": \"c\", \"field_name\": '\n", + " b'\"c\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadat'\n", + " b'a\": null}, {\"name\": \"agg_col1\", \"field_name\": \"agg_col1\", \"p'\n", + " b'andas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": nul'\n", + " b'l}, {\"name\": \"agg_col2\", \"field_name\": \"agg_col2\", \"pandas_t'\n", + " b'ype\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"n'\n", + " b'ame\": null, \"field_name\": \"__index_level_0__\", \"pandas_type\"'\n", + " b': \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}], \"panda'\n", + " b's_version\": \"0.24.2\"}')])\n" ] } ], @@ -1190,10 +1184,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": true - }, + "execution_count": 32, + "metadata": {}, "outputs": [], "source": [ "df.to_pandas().to_csv('foo.txt', index=False)" @@ -1208,26 +1200,24 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a b c a1 a2\n", - " 0 0 19 0 1 1\n", - " 1 1 18 1 0 0\n", - " 2 2 17 2 1 0\n", - " 3 3 16 3 0 1\n", - " 4 4 15 4 1 0\n", - " 5 5 14 5 0 0\n", - " 6 6 13 6 1 1\n", - " 7 7 12 7 0 0\n", - " 8 8 11 8 1 0\n", - " 9 9 10 9 0 1\n", + " a b c a1 a2\n", + "0 0 19 0 1 1\n", + "1 1 18 1 0 0\n", + "2 2 17 2 1 0\n", + "3 3 16 3 0 1\n", + "4 4 15 4 1 0\n", + "5 5 14 5 0 0\n", + "6 6 13 6 1 1\n", + "7 7 12 7 0 0\n", + "8 8 11 8 1 0\n", + "9 9 10 9 0 1\n", "[10 more rows]\n" ] } @@ -1254,6 +1244,133 @@ "## ORC" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Indexing with MultiIndex" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "cuDF supports hierarchical indexing of `DataFrame`s using MultiIndex:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex(levels=[['a', 'b'], [1, 2, 3, 4]],\n", + " codes=[[0, 0, 1, 1], [0, 1, 2, 3]])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrays = [['a', 'a', 'b', 'b'],\n", + " [1, 2, 3, 4]]\n", + "tuples = list(zip(*arrays))\n", + "idx = cudf.MultiIndex.from_tuples(tuples)\n", + "idx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This index can back either axis of a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " first second\n", + "a 1 0.480047 0.073470\n", + " 2 0.888448 0.595152\n", + "b 3 0.208500 0.031151\n", + " 4 0.944581 0.665257\n" + ] + } + ], + "source": [ + "gdf1 = cudf.DataFrame({'first': np.random.rand(4), 'second': np.random.rand(4)})\n", + "\n", + "# DataFrame with MultiIndex on axis 0\n", + "gdf1.index = idx\n", + "print(gdf1.to_pandas())" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " a b \n", + " 1 2 3 4\n", + "0 0.637385 0.862465 0.941638 0.445627\n", + "1 0.669958 0.924331 0.619423 0.322586\n" + ] + } + ], + "source": [ + "gdf2 = cudf.DataFrame({'first': np.random.rand(4), 'second': np.random.rand(4)}).T\n", + "\n", + "# DataFrame with MultiIndex on axis 1\n", + "gdf2.columns = idx\n", + "print(gdf2.to_pandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Accessing values of a `DataFrame` with a `MultiIndex`. Note that slicing is not yet supported." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " first second\n", + "b 3 0.2085 0.031151\n" + ] + } + ], + "source": [ + "print(gdf1.loc[('b', 3)].to_pandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Grouping hierarchically (see \"Grouping\" above) automatically produces a `DataFrame` with a `MultiIndex`." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1301,9 +1418,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.6" + "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}