diff --git a/CHANGELOG.md b/CHANGELOG.md index f45d010b571..8108c40ec5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ - PR #1466 Add GPU-accelerated ORC Reader - PR #1565 Add build script for nightly doc builds - PR #1508 Add Series isna, isnull, and notna +- PR #1301 MultiIndex support ## Improvements diff --git a/ci/local/build.sh b/ci/local/build.sh old mode 100644 new mode 100755 diff --git a/cpp/src/binary/binary_ops.cu b/cpp/src/binary/binary_ops.cu index d2799c6b53c..f93b3dc339b 100644 --- a/cpp/src/binary/binary_ops.cu +++ b/cpp/src/binary/binary_ops.cu @@ -256,6 +256,7 @@ gdf_error gdf_div_f64(gdf_column *lhs, gdf_column *rhs, gdf_column *output) { gdf_error F##_generic(gdf_column *lhs, gdf_column *rhs, gdf_column *output) { \ switch ( lhs->dtype ) { \ case GDF_INT8: return F##_i8(lhs, rhs, output); \ + case GDF_STRING_CATEGORY: \ case GDF_INT32: return F##_i32(lhs, rhs, output); \ case GDF_INT64: return F##_i64(lhs, rhs, output); \ case GDF_FLOAT32: return F##_f32(lhs, rhs, output); \ diff --git a/python/cudf/__init__.py b/python/cudf/__init__.py index 3ade67c4d48..d7a499a5d69 100644 --- a/python/cudf/__init__.py +++ b/python/cudf/__init__.py @@ -1,9 +1,9 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. from cudf import dataframe from cudf import datasets from cudf.dataframe import DataFrame, from_pandas, merge -from cudf.dataframe import Index +from cudf.dataframe import Index, MultiIndex from cudf.dataframe import Series from cudf.multi import concat from cudf.io import (read_csv, read_parquet, read_feather, read_json, diff --git a/python/cudf/bindings/groupby.pyx b/python/cudf/bindings/groupby.pyx index 31a210d938c..94c1b31be52 100644 --- a/python/cudf/bindings/groupby.pyx +++ b/python/cudf/bindings/groupby.pyx @@ -334,12 +334,10 @@ def agg(groupby_class, args): sort_results=sort_results ) add_col_values = False # we only want to add them once - # TODO: Do multindex here - if(groupby_class._as_index) and 1 == len(groupby_class._by): - idx = index.as_index(result[groupby_class._by[0]]) - idx.name = groupby_class._by[0] - result = result.set_index(idx) - result.drop_column(idx.name) + if(groupby_class._as_index): + result = groupby_class.apply_multiindex_or_single_index(result) + if use_prefix: + result = groupby_class.apply_multicolumn(result, args) elif isinstance(args, collections.abc.Mapping): if (len(args.keys()) == 1): if(len(list(args.values())[0]) == 1): @@ -377,15 +375,13 @@ def agg(groupby_class, args): sort_results=sort_results ) add_col_values = False # we only want to add them once - # TODO: Do multindex here - if(groupby_class._as_index) and 1 == len(groupby_class._by): - idx = index.as_index(result[groupby_class._by[0]]) - idx.name = groupby_class._by[0] - result = result.set_index(idx) - result.drop_column(idx.name) + if groupby_class._as_index: + result = groupby_class.apply_multiindex_or_single_index(result) + if use_prefix: + result = groupby_class.apply_multicolumn_mapped(result, args) else: result = groupby_class.agg([args]) - + free(ctx) nvtx_range_pop() @@ -431,18 +427,13 @@ def _apply_basic_agg(groupby_class, agg_type, sort_results=False): else: idx.name = groupby_class._by[0] result_series = result_series.set_index(idx) + if groupby_class._as_index: + result = groupby_class.apply_multiindex_or_single_index(result) + result_series.index = result.index return result_series - # TODO: Do MultiIndex here - if(groupby_class._as_index): - idx = index.as_index(result[groupby_class._by[0]]) - idx.name = groupby_class._by[0] - result.drop_column(idx.name) - if groupby_class.level == 0: - idx.name = groupby_class._original_index_name - else: - idx.name = groupby_class._by[0] - result = result.set_index(idx) + if groupby_class._as_index: + result = groupby_class.apply_multiindex_or_single_index(result) nvtx_range_pop() diff --git a/python/cudf/dataframe/__init__.py b/python/cudf/dataframe/__init__.py index 85435163653..ae109fc2fa8 100644 --- a/python/cudf/dataframe/__init__.py +++ b/python/cudf/dataframe/__init__.py @@ -1,9 +1,12 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. + from cudf.dataframe import (buffer, dataframe, series, index, numerical, datetime, categorical, string) from cudf.dataframe.dataframe import DataFrame, from_pandas, merge from cudf.dataframe.index import (Index, GenericIndex, RangeIndex, DatetimeIndex, CategoricalIndex) +from cudf.dataframe.multiindex import MultiIndex from cudf.dataframe.series import Series from cudf.dataframe.buffer import Buffer from cudf.dataframe.numerical import NumericalColumn diff --git a/python/cudf/dataframe/dataframe.py b/python/cudf/dataframe/dataframe.py index 7549e907f23..0496c96f498 100644 --- a/python/cudf/dataframe/dataframe.py +++ b/python/cudf/dataframe/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. from __future__ import print_function, division @@ -29,6 +29,7 @@ from librmm_cffi import librmm as rmm +import cudf from cudf import formatting from cudf.utils import cudautils, queryutils, applyutils, utils, ioutils from cudf.dataframe.index import as_index, Index, RangeIndex @@ -224,10 +225,14 @@ def __getitem__(self, arg): >>> print(df[[True, False, True, False]]) # mask the entire dataframe, # returning the rows specified in the boolean mask """ + if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex) and\ + isinstance(arg, tuple): + return self.columns._get_column_major(self, arg) if isinstance(arg, str) or isinstance(arg, numbers.Integral) or \ isinstance(arg, tuple): s = self._cols[arg] s.name = arg + s.index = self.index return s elif isinstance(arg, slice): df = DataFrame() @@ -247,7 +252,7 @@ def __getitem__(self, arg): index = self.index.take(selinds.to_gpu_array()) for col in self._cols: df[col] = Series(self._cols[col][arg], index=index) - df.set_index(index) + df = df.set_index(index) else: for col in arg: df[col] = self[col] @@ -272,7 +277,6 @@ def mask(self, other): def __setitem__(self, name, col): """Add/set column by *name or DataFrame* """ - # div[div < 0] = 0 if isinstance(name, DataFrame): for col_name in self._cols: mask = name[col_name] @@ -399,6 +403,11 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET): >>> df.to_string() ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' """ + if isinstance(self.index, cudf.dataframe.multiindex.MultiIndex) or\ + isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex): + raise TypeError("You're trying to print a DataFrame that contains " + "a MultiIndex. Print this dataframe with " + ".to_pandas()") if nrows is NOTSET: nrows = settings.formatting.get('nrows') if ncols is NOTSET: @@ -420,9 +429,12 @@ def to_string(self, nrows=NOTSET, ncols=NOTSET): # Prepare cells cols = OrderedDict() dtypes = OrderedDict() - use_cols = list(self.columns[:ncols - 1]) - if ncols > 0: - use_cols.append(self.columns[-1]) + if hasattr(self, 'multi_cols'): + use_cols = list(range(len(self.columns))) + else: + use_cols = list(self.columns[:ncols - 1]) + if ncols > 0: + use_cols.append(self.columns[-1]) for h in use_cols: cols[h] = self[h].values_to_string(nrows=nrows) @@ -664,19 +676,41 @@ def iloc(self): def columns(self): """Returns a tuple of columns """ - return pd.Index(self._cols) + if hasattr(self, 'multi_cols'): + return self.multi_cols + else: + return pd.Index(self._cols) @columns.setter def columns(self, columns): + if isinstance(columns, Index): + if len(columns) != len(self.columns): + msg = f"Length mismatch: Expected axis has %d elements, "\ + "new values have %d elements"\ + % (len(self.columns), len(columns)) + raise ValueError(msg) + """ + new_names = [] + for idx, name in enumerate(columns): + new_names.append(name) + self._rename_columns(new_names) + """ + self.multi_cols = columns + else: + if hasattr(self, 'multi_cols'): + delattr(self, 'multi_cols') + self._rename_columns(columns) + + def _rename_columns(self, new_names): old_cols = list(self._cols.keys()) l_old_cols = len(old_cols) - l_new_cols = len(columns) + l_new_cols = len(new_names) if l_new_cols != l_old_cols: msg = f'Length of new column names: {l_new_cols} does not ' \ 'match length of previous column names: {l_old_cols}' raise ValueError(msg) - mapper = dict(zip(old_cols, columns)) + mapper = dict(zip(old_cols, new_names)) self.rename(mapper=mapper, inplace=True) @property @@ -687,12 +721,26 @@ def index(self): @index.setter def index(self, _index): + if isinstance(_index, cudf.dataframe.multiindex.MultiIndex): + if len(_index) != len(self[self.columns[0]]): + msg = f"Length mismatch: Expected axis has "\ + "%d elements, new values "\ + "have %d elements"\ + % (len(self[self.columns[0]]), len(_index)) + raise ValueError(msg) + self._index = _index + for k in self.columns: + self[k].index = _index + return + new_length = len(_index) old_length = len(self._index) if new_length != old_length: - msg = f'Length mismatch: Expected index has {old_length}' \ - ' elements, new values have {new_length} elements' + msg = f"Length mismatch: Expected axis has "\ + "%d elements, new values "\ + "have %d elements"\ + % (old_length, new_length) raise ValueError(msg) # try to build an index from generic _index @@ -906,8 +954,8 @@ def drop(self, labels, axis=None): if axis == 0: raise NotImplementedError("Can only drop columns, not rows") - columns = [labels] if isinstance(labels, str) else list(labels) - + columns = [labels] if isinstance( + labels, (str, numbers.Number)) else list(labels) outdf = self.copy() for c in columns: outdf._drop_column(c) @@ -2240,6 +2288,13 @@ def to_pandas(self): out = pd.DataFrame(index=index) for c, x in self._cols.items(): out[c] = x.to_pandas(index=index) + if isinstance(self.columns, Index): + out.columns = self.columns + if isinstance(self.columns, cudf.dataframe.multiindex.MultiIndex): + if self.columns.names is not None: + out.columns.names = self.columns.names + else: + out.columns.name = self.columns.name return out @classmethod @@ -2269,7 +2324,12 @@ def from_pandas(cls, dataframe, nan_as_null=True): vals = dataframe[colk].values df[colk] = Series(vals, nan_as_null=nan_as_null) # Set index - return df.set_index(dataframe.index) + if isinstance(dataframe.index, pd.MultiIndex): + import cudf + index = cudf.from_pandas(dataframe.index) + else: + index = dataframe.index + return df.set_index(index) def to_arrow(self, preserve_index=True): """ @@ -2696,6 +2756,13 @@ def __getitem__(self, arg): row_slice = None row_label = None + if isinstance(self._df.index, cudf.dataframe.multiindex.MultiIndex)\ + and isinstance(arg, tuple): # noqa: E501 + # Explicitly ONLY support tuple indexes into MultiIndex. + # Pandas allows non tuple indices and warns "results may be + # undefined." + return self._df._index._get_row_major(self._df, arg) + if isinstance(arg, int): if arg < 0 or arg >= len(self._df): raise IndexError("label scalar %s is out of bound" % arg) @@ -2785,7 +2852,9 @@ def __setitem__(self, key, value): def from_pandas(obj): """ - Convert a Pandas DataFrame or Series object into the cudf equivalent + Convert certain Pandas objects into the cudf equivalent. + + Supports DataFrame, Series, or MultiIndex. Raises ------ @@ -2804,9 +2873,12 @@ def from_pandas(obj): return DataFrame.from_pandas(obj) elif isinstance(obj, pd.Series): return Series.from_pandas(obj) + elif isinstance(obj, pd.MultiIndex): + return cudf.dataframe.multiindex.MultiIndex.from_pandas(obj) else: raise TypeError( - "from_pandas only accepts Pandas Dataframes and Series objects. " + "from_pandas only accepts Pandas Dataframes, Series, and " + "MultiIndex objects. " "Got %s" % type(obj) ) diff --git a/python/cudf/dataframe/index.py b/python/cudf/dataframe/index.py index e5450b0510f..a4275abba97 100644 --- a/python/cudf/dataframe/index.py +++ b/python/cudf/dataframe/index.py @@ -9,6 +9,7 @@ from numba.cuda.cudadrv.devicearray import DeviceNDArray from librmm_cffi import librmm as rmm +import nvstrings from cudf.dataframe import columnops from cudf.utils import cudautils, utils, ioutils @@ -205,7 +206,14 @@ def __ge__(self, other): def equals(self, other): if len(self) != len(other): return False - return (self == other)._values.all() + elif len(self) == 1: + return self[0] == other[0] + else: + result = (self == other) + if isinstance(result, bool): + return result + else: + return result._values.all() def join(self, other, method, how='left', return_indexers=False): column_join_res = self.as_column().join( @@ -564,6 +572,44 @@ def categories(self): return self._values.categories +class StringIndex(GenericIndex): + """String defined indices into another Column + + Attributes + --- + _values: A StringColumn object or NDArray of strings + name: A string + """ + + def __init__(self, values, name=None): + if isinstance(values, StringColumn): + self._values = values.copy() + elif isinstance(values, StringIndex): + if name is None: + name = values.name + self._values = values.values.copy() + else: + self._values = columnops.build_column(nvstrings.to_device(values), + dtype='object') + self.name = name + + @property + def codes(self): + return self._values.codes + + @property + def categories(self): + return self._values.categories + + def to_pandas(self): + result = pd.Index(self.values, name=self.name) + return result + + def __repr__(self): + return "{}({}, dtype='object', name={})".format( + self.__class__.__name__, self._values.to_array(), self.name) + + def as_index(arbitrary, name=None): """Create an Index from an arbitrary object @@ -590,14 +636,12 @@ def as_index(arbitrary, name=None): return arbitrary elif isinstance(arbitrary, NumericalColumn): return GenericIndex(arbitrary, name=name) + elif isinstance(arbitrary, StringColumn): + return StringIndex(arbitrary, name=name) elif isinstance(arbitrary, DatetimeColumn): return DatetimeIndex(arbitrary, name=name) elif isinstance(arbitrary, CategoricalColumn): return CategoricalIndex(arbitrary, name=name) - elif isinstance(arbitrary, StringColumn): - raise NotImplementedError( - "Strings are not yet supported in the index" - ) else: if hasattr(arbitrary, 'name') and name is None: name = arbitrary.name diff --git a/python/cudf/dataframe/multiindex.py b/python/cudf/dataframe/multiindex.py new file mode 100644 index 00000000000..fe93bed6a9d --- /dev/null +++ b/python/cudf/dataframe/multiindex.py @@ -0,0 +1,311 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. + +import pandas as pd +import numpy as np +import warnings + +from collections.abc import Sequence +from copy import copy, deepcopy + +from cudf.dataframe import columnops +from cudf.comm.serialize import register_distributed_serializer +from cudf.dataframe.index import Index, StringIndex + + +class MultiIndex(Index): + """A multi-level or hierarchical index. + + Provides N-Dimensional indexing into Series and DataFrame objects. + + Properties + --- + levels: Labels for each category in the index hierarchy. + codes: Assignment of individual items into the categories of the hierarchy. + names: Name for each level + """ + + def __init__(self, levels, codes=None, labels=None, names=None): + self.names = names + column_names = [] + if labels: + warnings.warn("the 'labels' keyword is deprecated, use 'codes' " + "instead", FutureWarning) + if labels and not codes: + codes = labels + if isinstance(names, (Sequence, + pd.core.indexes.frozen.FrozenNDArray, + pd.core.indexes.frozen.FrozenList)): + if sum(x is None for x in names) > 1: + column_names = list(range(len(codes))) + else: + column_names = names + elif names is None: + column_names = list(range(len(codes))) + else: + column_names = names + if len(codes) == 0: + raise ValueError('MultiIndex codes can not be empty.') + import cudf + if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and\ + not isinstance(codes[0], (Sequence, + pd.core.indexes.frozen.FrozenNDArray)): + raise TypeError('Codes is not a Sequence of sequences') + if not isinstance(codes, cudf.dataframe.dataframe.DataFrame): + self.codes = cudf.dataframe.dataframe.DataFrame() + for idx, code in enumerate(codes): + code = np.array(code) + self.codes.add_column(column_names[idx], + columnops.as_column(code)) + else: + self.codes = codes + self.levels = levels + self._validate_levels_and_codes(self.levels, self.codes) + self.name = None + self.names = names + + def _validate_levels_and_codes(self, levels, codes): + levels = np.array(levels) + if len(levels) != len(codes.columns): + raise ValueError('MultiIndex has unequal number of levels and ' + 'codes and is inconsistent!') + code_length = len(codes[codes.columns[0]]) + for index, code in enumerate(codes): + if code_length != len(codes[code]): + raise ValueError('MultiIndex length of codes does not match ' + 'and is inconsistent!') + for index, code in enumerate(codes): + if codes[code].max() > len(levels[index])-1: + raise ValueError('MultiIndex code %d contains value %d larger ' + 'than maximum level size at this position') + + def copy(self, deep=True): + if(deep): + result = deepcopy(self) + else: + result = copy(self) + result.name = self.name + return result + + def _popn(self, n): + """ Returns a copy of this index without the left-most n values. + + Removes n names, labels, and codes in order to build a new index + for results. + """ + from cudf import DataFrame + codes = DataFrame() + for idx in self.codes.columns[n:]: + codes.add_column(idx, self.codes[idx]) + result = MultiIndex(self.levels[n:], codes) + result.names = self.names[n:] + return result + + def __repr__(self): + return "MultiIndex(levels=" + str(self.levels) +\ + ",\ncodes=" + str(self.codes) + ")" + + @property + def labels(self): + warnings.warn("This feature is deprecated in pandas and will be" + "dropped from cudf as well.", FutureWarning) + return self.codes + + def _compute_validity_mask(self, df, row_tuple): + """ Computes the valid set of indices of values in the lookup + """ + validity_mask = [] + for i, element in enumerate(row_tuple): + index_of_code_at_level = None + for level_index in range(len(self.levels[i])): + if self.levels[i][level_index] == element: + index_of_code_at_level = level_index + break + if index_of_code_at_level is None: + raise KeyError(element) + matches = [] + for k, code in enumerate(self.codes[self.codes.columns[i]]): + if k in validity_mask or len(validity_mask) == 0: + if code == index_of_code_at_level: + matches.append(k) + if len(matches) != 0: + validity_mask = matches + return validity_mask + + def _get_row_major(self, df, row_tuple): + valid_indices = self._compute_validity_mask(df, row_tuple) + from cudf import Series + result = df.take(Series(valid_indices)) + # Build new index - INDEX based MultiIndex + # --------------- + from cudf import DataFrame + out_index = DataFrame() + # Select the last n-k columns where n is the number of source + # levels and k is the length of the indexing tuple + for k in range(len(row_tuple), len(df.index.levels)): + out_index.add_column(df.index.names[k], + df.index.codes[df.index.codes.columns[k]]) + # If there's only one column remaining in the output index, convert + # it into a StringIndex and name the final index values according + # to the proper codes. + if len(out_index.columns) == 1: + out_index = [] + for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]: # noqa: E501 + out_index.append(result.index.levels[ + len(result.index.codes.columns)-1][val]) + # TODO: Warning! The final index column could be arbitrarily + # ordered integers, not Strings, so we need to check for that + # dtype and produce a GenericIndex instead of a StringIndex + out_index = StringIndex(out_index) + out_index.name = result.index.names[len(result.index.names)-1] + result.index = out_index + else: + # Otherwise pop the leftmost levels, names, and codes from the + # source index until it has the correct number of columns (n-k) + if(len(out_index.columns)) > 0: + result.reset_index(drop=True) + result.index = result.index._popn(len(row_tuple)) + return result + + def _get_column_major(self, df, row_tuple): + valid_indices = self._compute_validity_mask(df, row_tuple) + from cudf import DataFrame + result = DataFrame() + for ix, col in enumerate(df.columns): + if ix in valid_indices: + result[ix] = list(df._cols.values())[ix] + # Build new index - COLUMN based MultiIndex + # --------------- + if len(row_tuple) < len(self.levels): + columns = self._popn(len(row_tuple)) + result.columns = columns.take(valid_indices) + else: + result.columns = self.take(valid_indices) + if len(result.columns.levels) == 1: + columns = [] + for code in result.columns.codes[result.columns.codes.columns[0]]: + columns.append(result.columns.levels[0][code]) + name = result.columns.names[0] + result.columns = StringIndex(columns, name=name) + return result + + def __len__(self): + return len(self.codes[self.codes.columns[0]]) + + def __eq__(self, other): + return self.levels == other.levels and\ + self.codes == other.codes and\ + self.names == other.names + + @property + def is_contiguous(self): + return True + + @property + def size(self): + return len(self.codes[0]) + + def take(self, indices): + from collections.abc import Sequence + from cudf import Series + from numbers import Integral + if isinstance(indices, (Integral, Sequence)): + indices = np.array(indices) + elif isinstance(indices, Series): + indices = indices.to_gpu_array() + codes = self.codes.take(indices) + result = MultiIndex(self.levels, codes) + result.names = self.names + return result + + def __iter__(self): + self.n = 0 + return self + + def __next__(self): + if self.n < len(self.codes): + result = self[self.n] + self.n += 1 + return result + else: + raise StopIteration + + def __getitem__(self, index): + match = self.take(index) + result = [] + for level, item in enumerate(match.codes): + result.append(match.levels[level][match.codes[item][0]]) + return tuple(result) + + @property + def _values(self): + return list([i for i in self]) + + @classmethod + def from_tuples(cls, tuples, names=None): + # Use Pandas for handling Python host objects + pdi = pd.MultiIndex.from_tuples(tuples, names=names) + result = cls.from_pandas(pdi) + return result + + @classmethod + def from_frame(cls, dataframe, names=None): + # Use Pandas for handling Python host objects + pdi = pd.MultiIndex.from_frame(dataframe.to_pandas(), names=names) + result = cls.from_pandas(pdi) + return result + + @classmethod + def from_product(cls, arrays, names=None): + # Use Pandas for handling Python host objects + pdi = pd.MultiIndex.from_product(arrays, names=names) + result = cls.from_pandas(pdi) + return result + + def to_pandas(self): + pandas_codes = [] + for code in self.codes.columns: + pandas_codes.append(self.codes[code].to_array()) + # Backwards compatibility: + # Construct a dummy MultiIndex and check for the codes attr. + # This indicates that it is pandas >= 0.24 + # If no codes attr is present it is pandas <= 0.23 + if hasattr(pd.MultiIndex([[]], [[]]), 'codes'): + return pd.MultiIndex(levels=self.levels, codes=pandas_codes, + names=self.names) + else: + return pd.MultiIndex(levels=self.levels, labels=pandas_codes, + names=self.names) + + @classmethod + def from_pandas(cls, multiindex): + """ + Convert from a Pandas MultiIndex + + Raises + ------ + TypeError for invalid input type. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']], + codes=[[0, 1], [1, ]]) + >>> cudf.from_pandas(pmi) + MultiIndex( ... ) + """ + if not isinstance(multiindex, pd.MultiIndex): + raise TypeError('not a pandas.MultiIndex') + + if hasattr(multiindex, 'codes'): + mi = cls(levels=multiindex.levels, + codes=multiindex.codes, + names=multiindex.names) + else: + mi = cls(levels=multiindex.levels, + codes=multiindex.labels, + names=multiindex.names) + return mi + + +register_distributed_serializer(MultiIndex) diff --git a/python/cudf/dataframe/numerical.py b/python/cudf/dataframe/numerical.py index 0e3fde23341..c5f5b5e8cc2 100644 --- a/python/cudf/dataframe/numerical.py +++ b/python/cudf/dataframe/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. from __future__ import print_function, division diff --git a/python/cudf/dataframe/series.py b/python/cudf/dataframe/series.py index e7a0efb3d9c..f0aa12186d1 100644 --- a/python/cudf/dataframe/series.py +++ b/python/cudf/dataframe/series.py @@ -84,7 +84,6 @@ def __init__(self, data=None, index=None, name=None, nan_as_null=True, if index is not None and not isinstance(index, Index): index = as_index(index) - assert isinstance(data, columnops.TypedColumnBase) self._column = data self._index = RangeIndex(len(data)) if index is None else index @@ -296,7 +295,11 @@ def __getitem__(self, arg): def take(self, indices, ignore_index=False): """Return Series by taking values from the corresponding *indices*. """ - indices = Buffer(indices).to_gpu_array() + from cudf import Series + if isinstance(indices, Series): + indices = indices.to_gpu_array() + else: + indices = Buffer(indices).to_gpu_array() # Handle zero size if indices.size == 0: return self._copy_construct(data=self.data[:0], @@ -767,6 +770,10 @@ def index(self): """ return self._index + @index.setter + def index(self, _index): + self._index = _index + @property def iloc(self): """ diff --git a/python/cudf/dataframe/string.py b/python/cudf/dataframe/string.py index e27258fa77e..8a640552d2c 100644 --- a/python/cudf/dataframe/string.py +++ b/python/cudf/dataframe/string.py @@ -12,7 +12,9 @@ from cudf.dataframe.buffer import Buffer from cudf.utils import utils, cudautils +import cudf.bindings.binops as cpp_binops from cudf.bindings.cudf_cpp import get_ctype_ptr +from cudf.bindings.nvtx import nvtx_range_push, nvtx_range_pop from librmm_cffi import librmm as rmm @@ -335,6 +337,9 @@ def __init__(self, data, null_count=None, **kwargs): null_count : int; optional The number of null values in the mask. """ + from collections.abc import Sequence + if isinstance(data, Sequence): + data = nvstrings.to_device(data) assert isinstance(data, nvstrings.nvstrings) self._data = data self._dtype = np.dtype("object") @@ -522,6 +527,9 @@ def copy(self, deep=True): params = self._replace_defaults() return type(self)(**params) + def unordered_compare(self, cmpop, rhs): + return string_column_binop(self, rhs, op=cmpop) + def fillna(self, fill_value, inplace=False): """ Fill null values with * fill_value * @@ -548,3 +556,16 @@ def fillna(self, fill_value, inplace=False): result = StringColumn(filled_data) result = result.replace(mask=None) return self._mimic_inplace(result, inplace) + + +def string_column_binop(lhs, rhs, op): + nvtx_range_push("CUDF_BINARY_OP", "orange") + # Allocate output + masked = lhs.has_null_mask or rhs.has_null_mask + out = columnops.column_empty_like(lhs, dtype='bool', masked=masked) + # Call and fix null_count + null_count = cpp_binops.apply_op(lhs=lhs, rhs=rhs, out=out, op=op) + + result = out.replace(null_count=null_count) + nvtx_range_pop() + return result diff --git a/python/cudf/groupby/groupby.py b/python/cudf/groupby/groupby.py index 82fde49aaf9..196c5025891 100644 --- a/python/cudf/groupby/groupby.py +++ b/python/cudf/groupby/groupby.py @@ -1,5 +1,7 @@ # Copyright (c) 2018, NVIDIA CORPORATION. +import numpy as np + from numbers import Number from cudf.dataframe.dataframe import DataFrame @@ -110,6 +112,68 @@ def _apply_basic_agg(self, agg_type, sort_results=False): """ return _cpp_apply_basic_agg(self, agg_type, sort_results=sort_results) + def apply_multiindex_or_single_index(self, result): + if len(self._by) == 1: + from cudf.dataframe import index + idx = index.as_index(result[self._by[0]]) + idx.name = self._by[0] + result = result.drop(idx.name) + if idx.name == self._LEVEL_0_INDEX_NAME: + idx.name = None + result = result.set_index(idx) + return result + else: + levels = [] + codes = DataFrame() + names = [] + # Note: This is an O(N^2) solution using gpu masking + # to compute new codes for the MultiIndex. There may be + # a faster solution that could be executed on gpu at the same + # time the groupby is calculated. + for by in self._by: + level = result[by].unique() + code = result[by] + for idx, value in enumerate(level): + level_mask = code == value + code = code.masked_assign(idx, level_mask) + levels.append(level) + codes[by] = code + names.append(by) + from cudf import MultiIndex + multi_index = MultiIndex(levels=levels, + codes=codes, + names=names) + final_result = DataFrame() + for col in result.columns: + if col not in self._by: + final_result[col] = result[col] + return final_result.set_index(multi_index) + + def apply_multicolumn(self, result, aggs): + levels = [] + codes = [] + levels.append(self._val_columns) + levels.append(aggs) + codes.append(list(np.zeros(len(aggs), dtype='int64'))) + codes.append(list(range(len(aggs)))) + from cudf import MultiIndex + result.columns = MultiIndex(levels, codes) + return result + + def apply_multicolumn_mapped(self, result, aggs): + if len(set(aggs.keys())) == len(aggs.keys()) and\ + isinstance(aggs[list(aggs.keys())[0]], (str, Number)): + result.columns = aggs.keys() + else: + tuples = [] + for k in aggs.keys(): + for v in aggs[k]: + tuples.append((k, v)) + from cudf import MultiIndex + multiindex = MultiIndex.from_tuples(tuples) + result.columns = multiindex + return result + def __getitem__(self, arg): if isinstance(arg, (str, Number)): if arg not in self._val_columns: diff --git a/python/cudf/tests/test_groupby.py b/python/cudf/tests/test_groupby.py index 7dbab319e34..8206789f57c 100644 --- a/python/cudf/tests/test_groupby.py +++ b/python/cudf/tests/test_groupby.py @@ -75,16 +75,16 @@ def test_groupby_getitem_styles(): @pytest.mark.parametrize('nelem', get_nelem()) @pytest.mark.parametrize('method', get_methods()) def test_groupby_mean(nelem, method): - # gdf got_df = make_frame(DataFrame, nelem=nelem).groupby( ['x', 'y'], method=method).mean() - got = np.sort(got_df['val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(['x', 'y']).mean() - expect = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect, got) + if method == "cudf": + got = np.sort(got_df['val'].to_array()) + expect = np.sort(expect_df['val'].values) + np.testing.assert_array_almost_equal(expect, got) + else: + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', get_nelem()) @@ -92,38 +92,35 @@ def test_groupby_mean(nelem, method): def test_groupby_mean_3level(nelem, method): lvls = 'z' bys = list('xyz') - # gdf got_df = make_frame(DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys, method=method).mean() - got = np.sort(got_df['val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() - expect = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect, got) + if method == "cudf": + got = np.sort(got_df['val'].to_array()) + expect = np.sort(expect_df['val'].values) + np.testing.assert_array_almost_equal(expect, got) + else: + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', get_nelem()) @pytest.mark.parametrize('method', get_methods()) def test_groupby_agg_mean_min(nelem, method): - # gdf (Note: lack of multindex) got_df = make_frame(DataFrame, nelem=nelem).groupby( ['x', 'y'], method=method).agg(['mean', 'min']) + expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby( + ['x', 'y']).agg(['mean', 'min']) if method == "cudf": got_mean = np.sort(got_df['val_mean'].to_array()) got_min = np.sort(got_df['val_min'].to_array()) + expect_mean = np.sort(expect_df['val', 'mean'].values) + expect_min = np.sort(expect_df['val', 'min'].values) + # verify + np.testing.assert_array_almost_equal(expect_mean, got_mean) + np.testing.assert_array_almost_equal(expect_min, got_min) else: - got_mean = np.sort(got_df['mean_val'].to_array()) - got_min = np.sort(got_df['min_val'].to_array()) - # pandas - expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby( - ['x', 'y']).agg(['mean', 'min']) - expect_mean = np.sort(expect_df['val', 'mean'].values) - expect_min = np.sort(expect_df['val', 'min'].values) - # verify - np.testing.assert_array_almost_equal(expect_mean, got_mean) - np.testing.assert_array_almost_equal(expect_min, got_min) + assert_eq(expect_df, got_df) @pytest.mark.parametrize('nelem', get_nelem()) @@ -132,20 +129,18 @@ def test_groupby_agg_min_max_dictargs(nelem, method): # gdf (Note: lack of multindex) got_df = make_frame(DataFrame, nelem=nelem, extra_vals='ab').groupby( ['x', 'y'], method=method).agg({'a': 'min', 'b': 'max'}) + expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_vals='ab').groupby( + ['x', 'y']).agg({'a': 'min', 'b': 'max'}) if method == "cudf": got_min = np.sort(got_df['a'].to_array()) got_max = np.sort(got_df['b'].to_array()) + expect_min = np.sort(expect_df['a'].values) + expect_max = np.sort(expect_df['b'].values) + # verify + np.testing.assert_array_almost_equal(expect_min, got_min) + np.testing.assert_array_almost_equal(expect_max, got_max) else: - got_min = np.sort(got_df['min_a'].to_array()) - got_max = np.sort(got_df['max_b'].to_array()) - # pandas - expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_vals='ab').groupby( - ['x', 'y']).agg({'a': 'min', 'b': 'max'}) - expect_min = np.sort(expect_df['a'].values) - expect_max = np.sort(expect_df['b'].values) - # verify - np.testing.assert_array_almost_equal(expect_min, got_min) - np.testing.assert_array_almost_equal(expect_max, got_max) + assert_eq(expect_df, got_df) @pytest.mark.parametrize('method', get_methods()) @@ -274,8 +269,6 @@ def emulate(df): 'max', 'count', 'sum']) @pytest.mark.parametrize('method', get_methods()) def test_groupby_cudf_2keys_agg(nelem, func, method): - # gdf (Note: lack of multindex) - # skip unimplemented aggs: if func in ['var', 'std']: if method in ['hash', 'sort']: @@ -288,10 +281,12 @@ def test_groupby_cudf_2keys_agg(nelem, func, method): # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem)\ .groupby(['x', 'y']).agg(func) - - expect_agg = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect_agg, got_agg) + if method == 'cudf': + expect_agg = np.sort(expect_df['val'].values) + # verify + np.testing.assert_array_almost_equal(expect_agg, got_agg) + else: + assert_eq(got_df, expect_df) @pytest.mark.parametrize('agg', ['min', 'max', 'count', 'sum', 'mean']) diff --git a/python/cudf/tests/test_libgdf_groupby.py b/python/cudf/tests/test_libgdf_groupby.py index 97e0202cb85..03a059a5f72 100644 --- a/python/cudf/tests/test_libgdf_groupby.py +++ b/python/cudf/tests/test_libgdf_groupby.py @@ -6,6 +6,7 @@ import pandas as pd from cudf.dataframe import DataFrame +from cudf.tests.utils import assert_eq def make_frame(dataframe_class, nelem, seed=0, extra_levels=(), extra_vals=()): @@ -27,104 +28,58 @@ def make_frame(dataframe_class, nelem, seed=0, extra_levels=(), extra_vals=()): @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) def test_groupby_mean(nelem): - # gdf got_df = make_frame(DataFrame, nelem=nelem).groupby( ['x', 'y'], method="hash").mean() - got = np.sort(got_df['val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(['x', 'y']).mean() - expect = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect, got) + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) def test_groupby_mean_3level(nelem): lvls = 'z' bys = list('xyz') - # gdf got_df = make_frame(DataFrame, nelem=nelem, extra_levels=lvls)\ .groupby(bys, method="hash").mean() - got = np.sort(got_df['val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls).groupby(bys).mean() - expect = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect, got) + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) def test_groupby_agg_mean_min(nelem): - # gdf (Note: lack of multindex) got_df = make_frame(DataFrame, nelem=nelem).groupby( ['x', 'y'], method="hash").agg(['mean', 'min']) - got_mean = np.sort(got_df['mean_val'].to_array()) - got_min = np.sort(got_df['min_val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem).groupby(['x', 'y'])\ .agg(['mean', 'min']) - expect_mean = np.sort(expect_df['val', 'mean'].values) - expect_min = np.sort(expect_df['val', 'min'].values) - # verify - np.testing.assert_array_almost_equal(expect_mean, got_mean) - np.testing.assert_array_almost_equal(expect_min, got_min) + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictargs(nelem): - # gdf (Note: lack of multindex) - got_df = make_frame(DataFrame, nelem=nelem, extra_vals='ab').groupby( - ['x', 'y'], method="hash").agg({'a': 'min', 'b': 'max'}) - got_min = np.sort(got_df['min_a'].to_array()) - got_max = np.sort(got_df['max_b'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_vals='ab').groupby( ['x', 'y']).agg({'a': 'min', 'b': 'max'}) - expect_min = np.sort(expect_df['a'].values) - expect_max = np.sort(expect_df['b'].values) - # verify - np.testing.assert_array_almost_equal(expect_min, got_min) - np.testing.assert_array_almost_equal(expect_max, got_max) + got_df = make_frame(DataFrame, nelem=nelem, extra_vals='ab').groupby( + ['x', 'y'], method="hash").agg({'a': 'min', 'b': 'max'}) + assert_eq(expect_df, got_df) @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) def test_groupby_agg_min_max_dictlist(nelem): - # gdf (Note: lack of multindex) + expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_vals='ab').groupby( + ['x', 'y']).agg({'a': ['min', 'max'], 'b': ['min', 'max']}) got_df = make_frame(DataFrame, nelem=nelem, extra_vals='ab').groupby( ['x', 'y'], method="hash").agg({'a': ['min', 'max'], 'b': ['min', 'max']}) - got_min_a = np.sort(got_df['min_a'].to_array()) - got_max_a = np.sort(got_df['max_a'].to_array()) - got_min_b = np.sort(got_df['min_b'].to_array()) - got_max_b = np.sort(got_df['max_b'].to_array()) - # pandas - expect_df = make_frame(pd.DataFrame, nelem=nelem, extra_vals='ab').groupby( - ['x', 'y']).agg({'a': ['min', 'max'], 'b': ['min', 'max']}) - expect_min_a = np.sort(expect_df['a']['min'].values) - expect_max_a = np.sort(expect_df['a']['max'].values) - expect_min_b = np.sort(expect_df['b']['min'].values) - expect_max_b = np.sort(expect_df['b']['max'].values) - # verify - np.testing.assert_array_almost_equal(expect_min_a, got_min_a) - np.testing.assert_array_almost_equal(expect_max_a, got_max_a) - np.testing.assert_array_almost_equal(expect_min_b, got_min_b) - np.testing.assert_array_almost_equal(expect_max_b, got_max_b) + assert_eq(got_df, expect_df) @pytest.mark.parametrize('nelem', [2, 3, 100, 1000]) @pytest.mark.parametrize('func', ['mean', 'min', 'max', 'count', 'sum']) def test_groupby_2keys_agg(nelem, func): # gdf (Note: lack of multindex) - got_df = make_frame(DataFrame, nelem=nelem)\ - .groupby(['x', 'y'], method="hash").agg(func) - - got_agg = np.sort(got_df['val'].to_array()) - # pandas expect_df = make_frame(pd.DataFrame, nelem=nelem)\ .groupby(['x', 'y']).agg(func) - - expect_agg = np.sort(expect_df['val'].values) - # verify - np.testing.assert_array_almost_equal(expect_agg, got_agg) + got_df = make_frame(DataFrame, nelem=nelem)\ + .groupby(['x', 'y'], method="hash").agg(func) + assert_eq(got_df, expect_df) diff --git a/python/cudf/tests/test_multiindex.py b/python/cudf/tests/test_multiindex.py new file mode 100644 index 00000000000..fe56f771f50 --- /dev/null +++ b/python/cudf/tests/test_multiindex.py @@ -0,0 +1,397 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. + +""" +Test related to MultiIndex +""" +import pytest + +import cudf +import numpy as np +import pandas as pd + +from cudf.tests.utils import assert_eq + + +def test_multiindex_levels_codes_validation(): + levels = [['a', 'b'], ['c', 'd']] + # Codes not a sequence of sequences + with pytest.raises(TypeError): + pd.MultiIndex(levels, [0, 1]) + with pytest.raises(TypeError): + cudf.MultiIndex(levels, [0, 1]) + # Codes don't match levels + with pytest.raises(ValueError): + pd.MultiIndex(levels, [[0], [1], [1]]) + with pytest.raises(ValueError): + cudf.MultiIndex(levels, [[0], [1], [1]]) + # Largest code greater than number of levels + with pytest.raises(ValueError): + pd.MultiIndex(levels, [[0, 1], [0, 2]]) + with pytest.raises(ValueError): + cudf.MultiIndex(levels, [[0, 1], [0, 2]]) + # Unequal code lengths + with pytest.raises(ValueError): + pd.MultiIndex(levels, [[0, 1], [0]]) + with pytest.raises(ValueError): + cudf.MultiIndex(levels, [[0, 1], [0]]) + # Didn't pass levels and codes + with pytest.raises(TypeError): + pd.MultiIndex() + with pytest.raises(TypeError): + cudf.MultiIndex() + # Didn't pass non zero levels and codes + with pytest.raises(ValueError): + pd.MultiIndex([], []) + with pytest.raises(ValueError): + cudf.MultiIndex([], []) + + +def test_multiindex_construction(): + levels = [['a', 'b'], ['c', 'd']] + codes = [[0, 1], [1, 0]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels=levels, codes=codes) + assert_eq(pmi, mi) + + +def test_multiindex_types(): + codes = [[0, 1], [1, 0]] + levels = [[0, 1], [2, 3]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + levels = [[1.2, 2.1], [1.3, 3.1]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + levels = [['a', 'b'], ['c', 'd']] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + + +def test_multiindex_df_assignment(): + pdf = pd.DataFrame({'x': [1, 2, 3]}) + gdf = cudf.from_pandas(pdf) + pdf.index = pd.MultiIndex([['a', 'b'], ['c', 'd']], + [[0, 1, 0], [1, 0, 1]]) + gdf.index = cudf.MultiIndex(levels=[['a', 'b'], ['c', 'd']], + codes=[[0, 1, 0], [1, 0, 1]]) + assert_eq(pdf, gdf) + + +def test_multiindex_series_assignment(): + ps = pd.Series([1, 2, 3]) + gs = cudf.from_pandas(ps) + ps.index = pd.MultiIndex([['a', 'b'], ['c', 'd']], + [[0, 1, 0], [1, 0, 1]]) + gs.index = cudf.MultiIndex(levels=[['a', 'b'], ['c', 'd']], + codes=[[0, 1, 0], [1, 0, 1]]) + assert_eq(ps, gs) + + +def test_string_index(): + from cudf.dataframe.index import StringIndex, StringColumn + pdf = pd.DataFrame(np.random.rand(5, 5)) + gdf = cudf.from_pandas(pdf) + stringIndex = ['a', 'b', 'c', 'd', 'e'] + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = np.array(['a', 'b', 'c', 'd', 'e']) + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = StringIndex(['a', 'b', 'c', 'd', 'e'], name='name') + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = StringColumn(['a', 'b', 'c', 'd', 'e'], name='name') + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + + +def test_multiindex_row_shape(): + pdf = pd.DataFrame(np.random.rand(0, 5)) + gdf = cudf.from_pandas(pdf) + pdfIndex = pd.MultiIndex([['a', 'b', 'c']], + [[0]]) + pdfIndex.names = ['alpha'] + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex, gdfIndex) + with pytest.raises(ValueError): + pdf.index = pdfIndex + with pytest.raises(ValueError): + gdf.index = gdfIndex + + +@pytest.fixture +def pdf(): + return pd.DataFrame(np.random.rand(7, 5)) + + +@pytest.fixture +def gdf(pdf): + return cudf.from_pandas(pdf) + + +@pytest.fixture +def pdfIndex(): + pdfIndex = pd.MultiIndex([['a', 'b', 'c'], + ['house', 'store', 'forest'], + ['clouds', 'clear', 'storm'], + ['fire', 'smoke', 'clear']], + [[0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1]]) + pdfIndex.names = ['alpha', 'location', 'weather', 'sign'] + return pdfIndex + + +def test_from_pandas(pdf, pdfIndex): + pdf.index = pdfIndex + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +def test_series_multiindex(pdfIndex): + ps = pd.Series(np.random.rand(7)) + gs = cudf.from_pandas(ps) + ps.index = pdfIndex + gs.index = cudf.from_pandas(pdfIndex) + assert_eq(ps, gs) + + +def test_multiindex_take(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + assert_eq(pdf.index.take([0]), gdf.index.take([0])) + assert_eq(pdf.index.take(np.array([0])), gdf.index.take(np.array([0]))) + from cudf import Series + assert_eq(pdf.index.take(Series([0])), gdf.index.take(Series([0]))) + assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1])) + assert_eq(pdf.index.take(np.array([0, 1])), + gdf.index.take(np.array([0, 1]))) + assert_eq(pdf.index.take(Series([0, 1])), gdf.index.take(Series([0, 1]))) + + +def test_multiindex_getitem(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + assert_eq(pdf.index[0], gdf.index[0]) + + +def test_multiindex_loc(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex, gdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + # return 2 rows, 0 remaining keys = dataframe with entire index + assert_eq(pdf.loc[('a', 'store', 'clouds', 'fire')], + gdf.loc[('a', 'store', 'clouds', 'fire')]) + # return 2 rows, 1 remaining key = dataframe with n-k index columns + assert_eq(pdf.loc[('a', 'store', 'storm')], + gdf.loc[('a', 'store', 'storm')]) + # return 2 rows, 2 remaining keys = dataframe with n-k index columns + assert_eq(pdf.loc[('a', 'store')], + gdf.loc[('a', 'store')]) + assert_eq(pdf.loc[('b', 'house')], + gdf.loc[('b', 'house')]) + # return 2 rows, n-1 remaining keys = dataframe with n-k index columns + assert_eq(pdf.loc[('a',)], + gdf.loc[('a',)]) + # return 1 row, 0 remaining keys = dataframe with entire index + assert_eq(pdf.loc[('a', 'store', 'storm', 'smoke')], + gdf.loc[('a', 'store', 'storm', 'smoke')]) + # return 1 row and 1 remaining key = series + assert_eq(pdf.loc[('c', 'forest', 'clear')], + gdf.loc[('c', 'forest', 'clear')]) + + +@pytest.mark.xfail(reason="Slicing MultiIndexes not supported yet", + raises=AttributeError) +def test_multiindex_loc_slice(pdf, gdf, pdfIndex): + gdf = cudf.from_pandas(pdf) + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + assert_eq(pdf.loc[('a', 'store'): ('b', 'house')], + gdf.loc[('a', 'store'): ('b', 'house')]) + + +def test_multiindex_loc_then_column(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex, gdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + assert_eq(pdf.loc[('a', 'store', 'clouds', 'fire')][0], + gdf.loc[('a', 'store', 'clouds', 'fire')][0]) + + +def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + with pytest.raises(KeyError): + print(pdf.loc[('d',)]) + with pytest.raises(KeyError): + print(gdf.loc[('d',)]) + assert_eq(pdf, gdf) + + +def test_multiindex_loc_rows_1_2_key(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + print(pdf.loc[('c', 'forest')]) + print(gdf.loc[('c', 'forest')].to_pandas()) + assert_eq(pdf.loc[('c', 'forest')], gdf.loc[('c', 'forest')]) + + +def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.index = pdfIndex + gdf.index = gdfIndex + print(pdf.loc[('c',)]) + print(gdf.loc[('c',)].to_pandas()) + assert_eq(pdf.loc[('c',)], gdf.loc[('c',)]) + + +def test_multiindex_column_shape(): + pdf = pd.DataFrame(np.random.rand(5, 0)) + gdf = cudf.from_pandas(pdf) + pdfIndex = pd.MultiIndex([['a', 'b', 'c']], + [[0]]) + pdfIndex.names = ['alpha'] + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex, gdfIndex) + with pytest.raises(ValueError): + pdf.columns = pdfIndex + with pytest.raises(ValueError): + gdf.columns = gdfIndex + + +def test_multiindex_columns(pdf, gdf, pdfIndex): + pdf = pdf.T + gdf = cudf.from_pandas(pdf) + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex, gdfIndex) + pdf.columns = pdfIndex + gdf.columns = gdfIndex + assert_eq(pdf[('a', 'store', 'clouds', 'fire')], + gdf[('a', 'store', 'clouds', 'fire')]) + assert_eq(pdf[('a', 'store', 'storm', 'smoke')], + gdf[('a', 'store', 'storm', 'smoke')]) + assert_eq(pdf[('a', 'store')], + gdf[('a', 'store')]) + assert_eq(pdf[('b', 'house')], + gdf[('b', 'house')]) + assert_eq(pdf[('a', 'store', 'storm')], + gdf[('a', 'store', 'storm')]) + assert_eq(pdf[('a',)], + gdf[('a',)]) + assert_eq(pdf[('c', 'forest', 'clear')], + gdf[('c', 'forest', 'clear')]) + +@pytest.mark.xfail(reason="Slicing MultiIndexes not supported yet", + raises=TypeError) +def test_multiindex_column_slice(pdf, gdf, pdfIndex): + pdf = pdf.T + gdf = cudf.from_pandas(pdf) + gdfIndex = cudf.from_pandas(pdfIndex) + pdf.columns = pdfIndex + gdf.columns = gdfIndex + assert_eq(pdf[('a', 'store'): ('b', 'house')], + gdf[('a', 'store'): ('b', 'house')]) + + +def test_multiindex_from_tuples(): + arrays = [['a', 'a', 'b', 'b'], + ['house', 'store', 'house', 'store']] + tuples = list(zip(*arrays)) + pmi = pd.MultiIndex.from_tuples(tuples) + gmi = cudf.MultiIndex.from_tuples(tuples) + assert_eq(pmi, gmi) + + +def test_multiindex_from_dataframe(): + if not hasattr(pd.MultiIndex([[]], [[]]), 'codes'): + pytest.skip() + pdf = pd.DataFrame([['a', 'house'], ['a', 'store'], + ['b', 'house'], ['b', 'store']]) + gdf = cudf.from_pandas(pdf) + pmi = pd.MultiIndex.from_frame(pdf, names=['alpha', 'location']) + gmi = cudf.MultiIndex.from_frame(gdf, names=['alpha', 'location']) + assert_eq(pmi, gmi) + + +def test_multiindex_from_product(): + arrays = [['a', 'a', 'b', 'b'], + ['house', 'store', 'house', 'store']] + pmi = pd.MultiIndex.from_product(arrays, names=['alpha', 'location']) + gmi = cudf.MultiIndex.from_product(arrays, names=['alpha', 'location']) + assert_eq(pmi, gmi) + + +def test_multiindex_index_and_columns(): + gdf = cudf.DataFrame() + gdf['x'] = np.random.randint(0, 5, 5) + gdf['y'] = np.random.randint(0, 5, 5) + pdf = gdf.to_pandas() + mi = cudf.MultiIndex(levels=[[0, 1, 2], [3, 4]], codes=[[0, 0, 1, 1, 2], + [0, 1, 0, 1, 1]], names=['x', 'y']) + gdf.index = mi + mc = cudf.MultiIndex(levels=[['val'], ['mean', 'min']], + codes=[[0, 0], [0, 1]]) + gdf.columns = mc + pdf.index = mi + pdf.index.names = ['x', 'y'] + pdf.columns = mc + assert_eq(pdf, gdf) + + +def test_multiindex_multiple_groupby(): + pdf = pd.DataFrame( + { + "a": [4, 17, 4, 9, 5], + "b": [1, 4, 4, 3, 2], + "x": np.random.normal(size=5), + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) + pdg = pdf.groupby(['a', 'b']).sum() + gdg = gdf.groupby(['a', 'b']).sum() + assert_eq(pdg, gdg) + pdg = pdf.groupby(['a', 'b']).x.sum() + gdg = gdf.groupby(['a', 'b']).x.sum() + assert_eq(pdg, gdg) + + +@pytest.mark.parametrize( + "func", + [ + lambda df: df.groupby(["x", "y"]).z.sum(), + lambda df: df.groupby(["x", "y"]).sum(), + ], +) +def test_multi_column(func): + pdf = pd.DataFrame( + { + "x": np.random.randint(0, 5, size=1000), + "y": np.random.randint(0, 10, size=1000), + "z": np.random.normal(size=1000), + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) + + a = func(pdf) + b = func(gdf) + + assert_eq(a, b) diff --git a/python/cudf/tests/test_string.py b/python/cudf/tests/test_string.py index e9419fa1822..ae9f6a91c2e 100644 --- a/python/cudf/tests/test_string.py +++ b/python/cudf/tests/test_string.py @@ -10,6 +10,7 @@ from cudf import concat from cudf.dataframe import DataFrame, Series +from cudf.dataframe.index import StringIndex, StringColumn from cudf.bindings.GDFError import GDFError from cudf.tests.utils import assert_eq from librmm_cffi import librmm as rmm @@ -753,13 +754,9 @@ def test_string_groupby_key_index(): gdf['b'] = other_data expect = pdf.groupby('a').count() - with pytest.raises( - NotImplementedError, - match="Strings are not yet supported in the index" - ): - got = gdf.groupby('a').count() + got = gdf.groupby('a').count() - assert_eq(expect, got) + assert_eq(expect, got) @pytest.mark.parametrize('scalar', [ @@ -776,3 +773,24 @@ def test_string_set_scalar(scalar): assert_eq(pdf['b'], gdf['b']) assert_eq(pdf, gdf) + + +def test_string_index(): + pdf = pd.DataFrame(np.random.rand(5, 5)) + gdf = DataFrame.from_pandas(pdf) + stringIndex = ['a', 'b', 'c', 'd', 'e'] + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = np.array(['a', 'b', 'c', 'd', 'e']) + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = StringIndex(['a', 'b', 'c', 'd', 'e'], name='name') + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) + stringIndex = StringColumn(['a', 'b', 'c', 'd', 'e'], name='name') + pdf.index = stringIndex + gdf.index = stringIndex + assert_eq(pdf, gdf) diff --git a/python/cudf/tests/utils.py b/python/cudf/tests/utils.py index 40ebda6a085..e3ef61d8cca 100644 --- a/python/cudf/tests/utils.py +++ b/python/cudf/tests/utils.py @@ -49,7 +49,7 @@ def assert_eq(a, b, **kwargs): tm.assert_frame_equal(a, b, **kwargs) elif isinstance(a, pd.Series): tm.assert_series_equal(a, b, **kwargs) - elif isinstance(a, pd.Index): + elif isinstance(a, (pd.Index, pd.MultiIndex)): tm.assert_index_equal(a, b, **kwargs) elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray): assert np.allclose(a, b, equal_nan=True)