Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
5adfd14
Created full suite of tests for MultiIndex
thomcom Mar 26, 2019
34f892d
Pass three tests, more to come.
thomcom Mar 28, 2019
176f72b
Pass 5 tests
thomcom Mar 28, 2019
109955f
Going to need a multiindex.py
thomcom Mar 28, 2019
e758015
Iloc implementation - working on passing in a dataframe of codes now …
thomcom Apr 1, 2019
46d7d8c
Add StringIndex and handle it properly with certain MultiIndex cases.
thomcom Apr 2, 2019
d9f6a84
Minor clean up of StringIndex
thomcom Apr 2, 2019
b26d584
Work through another test case
thomcom Apr 3, 2019
fdc691f
Pass more tests. Drop slice test for now. On to columns tests.
thomcom Apr 4, 2019
93a3af2
Add a pair of tests, clean up validity mask computation
thomcom Apr 4, 2019
7155245
Add semi-functional columnar MultiIndex tests and code. Fix a test_st…
thomcom Apr 16, 2019
39bbeac
Add a multitude of improved error messages and loc performance
thomcom Apr 16, 2019
90b51b3
More tests and considerable cleanup. Row-wise MI is pretty close to f…
thomcom Apr 18, 2019
2fd91e1
Get MultiColumns working now
thomcom Apr 23, 2019
7d88562
Add a test to build a df with index and columns both set to MultiInde…
thomcom Apr 25, 2019
b2526fe
Clean up linter
thomcom Apr 25, 2019
63ec4fb
CHANGELOG
thomcom Apr 25, 2019
61e85cc
More style cleanup because my linter isn't working properly.
thomcom Apr 25, 2019
a436217
Merge branch 'branch-0.7' into multiindex-branch-0.7
thomcom Apr 25, 2019
4a16b33
Fix error with nvtx refactor and add string comparison back
thomcom Apr 25, 2019
b6824cf
Major improvements to groupby as_index MultiIndex. Next do MultiColumn
thomcom Apr 27, 2019
46fabf9
Merge branch 'branch-0.7' into fea-ext-multiindex
thomcom Apr 29, 2019
58d59b2
Hopefully have fixed circular dependency import issue. Add a new test…
thomcom Apr 29, 2019
3a937e8
Merge branch 'branch-0.7' into fea-ext-multiindex
thomcom Apr 29, 2019
4299d2b
Style
thomcom Apr 29, 2019
ac52c12
Merge branch 'branch-0.7' into fea-ext-multiindex
thomcom Apr 30, 2019
6dec1b1
Fix another _gdf merge issue
thomcom Apr 30, 2019
5fb3fbe
Add multicolumn for groupby!
thomcom Apr 30, 2019
9e41996
Clean up test_libgdf_groupby.py some more
thomcom Apr 30, 2019
d1e1242
Fix issue with version 0.23.4 vs 0.24.2 of pandas
thomcom May 1, 2019
f0c975d
Better support for pandas 0.23.4
thomcom May 1, 2019
5ed8a94
Merge branch 'branch-0.7' into fea-ext-multiindex
thomcom May 1, 2019
157dac5
Resolve review items raised by @kkraus14 except one
thomcom May 1, 2019
838d630
Resolve final concern of @kkraus14
thomcom May 1, 2019
36aa013
Some review items
thomcom May 2, 2019
f494b87
Resolve remaining review concerns
thomcom May 2, 2019
081adc2
Merge branch 'branch-0.7' into fea-ext-multiindex
thomcom May 2, 2019
5a7a575
Fix CI build error involving numba 0.41.0
thomcom May 2, 2019
be1db05
Accidentally used the wrong line.
thomcom May 2, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- PR #1466 Add GPU-accelerated ORC Reader
- PR #1565 Add build script for nightly doc builds
- PR #1508 Add Series isna, isnull, and notna
- PR #1301 MultiIndex support

## Improvements

Expand Down
Empty file modified ci/local/build.sh
100644 → 100755
Empty file.
1 change: 1 addition & 0 deletions cpp/src/binary/binary_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ gdf_error gdf_div_f64(gdf_column *lhs, gdf_column *rhs, gdf_column *output) {
gdf_error F##_generic(gdf_column *lhs, gdf_column *rhs, gdf_column *output) { \
switch ( lhs->dtype ) { \
case GDF_INT8: return F##_i8(lhs, rhs, output); \
case GDF_STRING_CATEGORY: \
case GDF_INT32: return F##_i32(lhs, rhs, output); \
case GDF_INT64: return F##_i64(lhs, rhs, output); \
case GDF_FLOAT32: return F##_f32(lhs, rhs, output); \
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2019, NVIDIA CORPORATION.

from cudf import dataframe
from cudf import datasets
from cudf.dataframe import DataFrame, from_pandas, merge
from cudf.dataframe import Index
from cudf.dataframe import Index, MultiIndex
from cudf.dataframe import Series
from cudf.multi import concat
from cudf.io import (read_csv, read_parquet, read_feather, read_json,
Expand Down
37 changes: 14 additions & 23 deletions python/cudf/bindings/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,10 @@ def agg(groupby_class, args):
sort_results=sort_results
)
add_col_values = False # we only want to add them once
# TODO: Do multindex here
if(groupby_class._as_index) and 1 == len(groupby_class._by):
idx = index.as_index(result[groupby_class._by[0]])
idx.name = groupby_class._by[0]
result = result.set_index(idx)
result.drop_column(idx.name)
if(groupby_class._as_index):
result = groupby_class.apply_multiindex_or_single_index(result)
if use_prefix:
result = groupby_class.apply_multicolumn(result, args)
elif isinstance(args, collections.abc.Mapping):
if (len(args.keys()) == 1):
if(len(list(args.values())[0]) == 1):
Expand Down Expand Up @@ -377,15 +375,13 @@ def agg(groupby_class, args):
sort_results=sort_results
)
add_col_values = False # we only want to add them once
# TODO: Do multindex here
if(groupby_class._as_index) and 1 == len(groupby_class._by):
idx = index.as_index(result[groupby_class._by[0]])
idx.name = groupby_class._by[0]
result = result.set_index(idx)
result.drop_column(idx.name)
if groupby_class._as_index:
result = groupby_class.apply_multiindex_or_single_index(result)
if use_prefix:
result = groupby_class.apply_multicolumn_mapped(result, args)
else:
result = groupby_class.agg([args])

free(ctx)

nvtx_range_pop()
Expand Down Expand Up @@ -431,18 +427,13 @@ def _apply_basic_agg(groupby_class, agg_type, sort_results=False):
else:
idx.name = groupby_class._by[0]
result_series = result_series.set_index(idx)
if groupby_class._as_index:
result = groupby_class.apply_multiindex_or_single_index(result)
result_series.index = result.index
return result_series

# TODO: Do MultiIndex here
if(groupby_class._as_index):
idx = index.as_index(result[groupby_class._by[0]])
idx.name = groupby_class._by[0]
result.drop_column(idx.name)
if groupby_class.level == 0:
idx.name = groupby_class._original_index_name
else:
idx.name = groupby_class._by[0]
result = result.set_index(idx)
if groupby_class._as_index:
result = groupby_class.apply_multiindex_or_single_index(result)

nvtx_range_pop()

Expand Down
3 changes: 3 additions & 0 deletions python/cudf/dataframe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Copyright (c) 2018-2019, NVIDIA CORPORATION.

from cudf.dataframe import (buffer, dataframe, series,
index, numerical, datetime, categorical, string)

from cudf.dataframe.dataframe import DataFrame, from_pandas, merge
from cudf.dataframe.index import (Index, GenericIndex,
RangeIndex, DatetimeIndex, CategoricalIndex)
from cudf.dataframe.multiindex import MultiIndex
from cudf.dataframe.series import Series
from cudf.dataframe.buffer import Buffer
from cudf.dataframe.numerical import NumericalColumn
Expand Down
104 changes: 88 additions & 16 deletions python/cudf/dataframe/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2019, NVIDIA CORPORATION.

from __future__ import print_function, division

Expand Down Expand Up @@ -29,6 +29,7 @@

from librmm_cffi import librmm as rmm

import cudf
from cudf import formatting
from cudf.utils import cudautils, queryutils, applyutils, utils, ioutils
from cudf.dataframe.index import as_index, Index, RangeIndex
Expand Down Expand Up @@ -224,10 +225,14 @@ def __getitem__(self, arg):
>>> print(df[[True, False, True, False]]) # mask the entire dataframe,
# returning the rows specified in the boolean mask
"""
if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex) and\
isinstance(arg, tuple):
return self.columns._get_column_major(self, arg)
if isinstance(arg, str) or isinstance(arg, numbers.Integral) or \
isinstance(arg, tuple):
s = self._cols[arg]
s.name = arg
s.index = self.index
return s
elif isinstance(arg, slice):
df = DataFrame()
Expand All @@ -247,7 +252,7 @@ def __getitem__(self, arg):
index = self.index.take(selinds.to_gpu_array())
for col in self._cols:
df[col] = Series(self._cols[col][arg], index=index)
df.set_index(index)
df = df.set_index(index)
else:
for col in arg:
df[col] = self[col]
Expand All @@ -272,7 +277,6 @@ def mask(self, other):
def __setitem__(self, name, col):
"""Add/set column by *name or DataFrame*
"""
# div[div < 0] = 0
if isinstance(name, DataFrame):
for col_name in self._cols:
mask = name[col_name]
Expand Down Expand Up @@ -399,6 +403,11 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET):
>>> df.to_string()
' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0'
"""
if isinstance(self.index, cudf.dataframe.multiindex.MultiIndex) or\
isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex):
raise TypeError("You're trying to print a DataFrame that contains "
"a MultiIndex. Print this dataframe with "
".to_pandas()")
if nrows is NOTSET:
nrows = settings.formatting.get('nrows')
if ncols is NOTSET:
Expand All @@ -420,9 +429,12 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET):
# Prepare cells
cols = OrderedDict()
dtypes = OrderedDict()
use_cols = list(self.columns[:ncols - 1])
if ncols > 0:
use_cols.append(self.columns[-1])
if hasattr(self, 'multi_cols'):
use_cols = list(range(len(self.columns)))
else:
use_cols = list(self.columns[:ncols - 1])
if ncols > 0:
use_cols.append(self.columns[-1])

for h in use_cols:
cols[h] = self[h].values_to_string(nrows=nrows)
Expand Down Expand Up @@ -664,19 +676,41 @@ def iloc(self):
def columns(self):
"""Returns a tuple of columns
"""
return pd.Index(self._cols)
if hasattr(self, 'multi_cols'):
return self.multi_cols
else:
return pd.Index(self._cols)

@columns.setter
def columns(self, columns):
if isinstance(columns, Index):
if len(columns) != len(self.columns):
msg = f"Length mismatch: Expected axis has %d elements, "\
"new values have %d elements"\
% (len(self.columns), len(columns))
raise ValueError(msg)
"""
new_names = []
for idx, name in enumerate(columns):
new_names.append(name)
self._rename_columns(new_names)
"""
self.multi_cols = columns
else:
if hasattr(self, 'multi_cols'):
delattr(self, 'multi_cols')
self._rename_columns(columns)

def _rename_columns(self, new_names):
old_cols = list(self._cols.keys())
l_old_cols = len(old_cols)
l_new_cols = len(columns)
l_new_cols = len(new_names)
if l_new_cols != l_old_cols:
msg = f'Length of new column names: {l_new_cols} does not ' \
'match length of previous column names: {l_old_cols}'
raise ValueError(msg)

mapper = dict(zip(old_cols, columns))
mapper = dict(zip(old_cols, new_names))
self.rename(mapper=mapper, inplace=True)

@property
Expand All @@ -687,12 +721,26 @@ def index(self):

@index.setter
def index(self, _index):
if isinstance(_index, cudf.dataframe.multiindex.MultiIndex):
if len(_index) != len(self[self.columns[0]]):
msg = f"Length mismatch: Expected axis has "\
"%d elements, new values "\
"have %d elements"\
% (len(self[self.columns[0]]), len(_index))
raise ValueError(msg)
self._index = _index
for k in self.columns:
self[k].index = _index
return

new_length = len(_index)
old_length = len(self._index)

if new_length != old_length:
msg = f'Length mismatch: Expected index has {old_length}' \
' elements, new values have {new_length} elements'
msg = f"Length mismatch: Expected axis has "\
"%d elements, new values "\
"have %d elements"\
% (old_length, new_length)
raise ValueError(msg)

# try to build an index from generic _index
Expand Down Expand Up @@ -906,8 +954,8 @@ def drop(self, labels, axis=None):
if axis == 0:
raise NotImplementedError("Can only drop columns, not rows")

columns = [labels] if isinstance(labels, str) else list(labels)

columns = [labels] if isinstance(
labels, (str, numbers.Number)) else list(labels)
outdf = self.copy()
for c in columns:
outdf._drop_column(c)
Expand Down Expand Up @@ -2240,6 +2288,13 @@ def to_pandas(self):
out = pd.DataFrame(index=index)
for c, x in self._cols.items():
out[c] = x.to_pandas(index=index)
if isinstance(self.columns, Index):
out.columns = self.columns
if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex):
if self.columns.names is not None:
out.columns.names = self.columns.names
else:
out.columns.name = self.columns.name
return out

@classmethod
Expand Down Expand Up @@ -2269,7 +2324,12 @@ def from_pandas(cls, dataframe, nan_as_null=True):
vals = dataframe[colk].values
df[colk] = Series(vals, nan_as_null=nan_as_null)
# Set index
return df.set_index(dataframe.index)
if isinstance(dataframe.index, pd.MultiIndex):
import cudf
index = cudf.from_pandas(dataframe.index)
else:
index = dataframe.index
return df.set_index(index)

def to_arrow(self, preserve_index=True):
"""
Expand Down Expand Up @@ -2696,6 +2756,13 @@ def __getitem__(self, arg):
row_slice = None
row_label = None

if isinstance(self._df.index, cudf.dataframe.multiindex.MultiIndex)\
and isinstance(arg, tuple): # noqa: E501
# Explicitly ONLY support tuple indexes into MultiIndex.
# Pandas allows non tuple indices and warns "results may be
# undefined."
return self._df._index._get_row_major(self._df, arg)

if isinstance(arg, int):
if arg < 0 or arg >= len(self._df):
raise IndexError("label scalar %s is out of bound" % arg)
Expand Down Expand Up @@ -2785,7 +2852,9 @@ def __setitem__(self, key, value):

def from_pandas(obj):
"""
Convert a Pandas DataFrame or Series object into the cudf equivalent
Convert certain Pandas objects into the cudf equivalent.

Supports DataFrame, Series, or MultiIndex.

Raises
------
Expand All @@ -2804,9 +2873,12 @@ def from_pandas(obj):
return DataFrame.from_pandas(obj)
elif isinstance(obj, pd.Series):
return Series.from_pandas(obj)
elif isinstance(obj, pd.MultiIndex):
return cudf.dataframe.multiindex.MultiIndex.from_pandas(obj)
else:
raise TypeError(
"from_pandas only accepts Pandas Dataframes and Series objects. "
"from_pandas only accepts Pandas Dataframes, Series, and "
"MultiIndex objects. "
"Got %s" % type(obj)
)

Expand Down
Loading