Skip to content

Commit

Permalink
Avoid index-to-column conversion in some DataFrame ops (rapidsai#15763)
Browse files Browse the repository at this point in the history
xref rapidsai#15494

* For `Index.str`, check the index's `dtype` instead of the type of its underlying column (accessing the column would materialize a `RangeIndex`)
* For `set_index`, defer converting the passed objects to columns until it is actually necessary
* For `_make_operands_and_index_for_binop`, create each pandas object only once and reuse it

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai#15763
  • Loading branch information
mroeschke authored May 21, 2024
1 parent 60d5717 commit d78d565
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 69 deletions.
109 changes: 41 additions & 68 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2047,29 +2047,24 @@ def _make_operands_and_index_for_binop(
equal_columns = True
elif isinstance(other, Series):
if (
not can_reindex
and fn in cudf.utils.utils._EQUALITY_OPS
and (
not self._data.to_pandas_index().equals(
other.index.to_pandas()
)
not (self_pd_columns := self._data.to_pandas_index()).equals(
other_pd_index := other.index.to_pandas()
)
and not can_reindex
and fn in cudf.utils.utils._EQUALITY_OPS
):
raise ValueError(
"Can only compare DataFrame & Series objects "
"whose columns & index are same respectively, "
"please reindex."
)
rhs = dict(zip(other.index.to_pandas(), other.values_host))
rhs = dict(zip(other_pd_index, other.values_host))
# For keys in right but not left, perform binops between NaN (not
# NULL!) and the right value (result is NaN).
left_default = as_column(np.nan, length=len(self))
equal_columns = other.index.to_pandas().equals(
self._data.to_pandas_index()
)
equal_columns = other_pd_index.equals(self_pd_columns)
can_use_self_column_name = (
equal_columns
or list(other._index._data.names) == self._data._level_names
equal_columns or other_pd_index.names == self_pd_columns.names
)
elif isinstance(other, DataFrame):
if (
Expand Down Expand Up @@ -2952,82 +2947,60 @@ def set_index(

if not isinstance(keys, list):
keys = [keys]
if len(keys) == 0:
raise ValueError("No valid columns to be added to index.")
if append:
keys = [self.index] + keys

# Preliminary type check
col_not_found = []
columns_to_add = []
labels_not_found = []
data_to_add = []
names = []
to_drop = []
for col in keys:
# Is column label
# label-like
if is_scalar(col) or isinstance(col, tuple):
if col in self._column_names:
columns_to_add.append(self[col])
data_to_add.append(self[col])
names.append(col)
if drop:
to_drop.append(col)
else:
col_not_found.append(col)
labels_not_found.append(col)
# index-like
elif isinstance(col, (MultiIndex, pd.MultiIndex)):
if isinstance(col, pd.MultiIndex):
col = MultiIndex.from_pandas(col)
data_to_add.extend(col._data.columns)
names.extend(col.names)
elif isinstance(
col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
):
data_to_add.append(col)
names.append(col.name)
else:
# Try coerce into column
if not is_column_like(col):
try:
col = as_column(col)
except TypeError:
msg = f"{col} cannot be converted to column-like."
raise TypeError(msg)
if isinstance(col, (MultiIndex, pd.MultiIndex)):
col = (
cudf.from_pandas(col)
if isinstance(col, pd.MultiIndex)
else col
)
cols = [col._data[x] for x in col._data]
columns_to_add.extend(cols)
names.extend(col.names)
else:
if isinstance(col, (pd.RangeIndex, cudf.RangeIndex)):
# Corner case: RangeIndex does not need to instantiate
columns_to_add.append(col)
else:
# For pandas obj, convert to gpu obj
columns_to_add.append(as_column(col))
if isinstance(
col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
):
names.append(col.name)
else:
names.append(None)

if col_not_found:
raise KeyError(f"None of {col_not_found} are in the columns")
try:
col = as_column(col)
except TypeError as err:
msg = f"{col} cannot be converted to column-like."
raise TypeError(msg) from err
data_to_add.append(col)
names.append(None)

if append:
idx_cols = [self.index._data[x] for x in self.index._data]
if isinstance(self.index, MultiIndex):
idx_names = self.index.names
else:
idx_names = [self.index.name]
columns_to_add = idx_cols + columns_to_add
names = idx_names + names
if labels_not_found:
raise KeyError(f"None of {labels_not_found} are in the columns")

if len(columns_to_add) == 0:
raise ValueError("No valid columns to be added to index.")
elif (
len(columns_to_add) == 1
if (
len(data_to_add) == 1
and len(keys) == 1
and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex))
):
idx = cudf.Index(columns_to_add[0], name=names[0])
# Don't turn single level MultiIndex into an Index
idx = cudf.Index(data_to_add[0], name=names[0])
else:
idx = MultiIndex._from_data(
{i: col for i, col in enumerate(columns_to_add)}
)
idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
idx.names = names

if not isinstance(idx, BaseIndex):
raise ValueError("Parameter index should be type `Index`.")

df = self if inplace else self.copy(deep=True)

if verify_integrity and not idx.is_unique:
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
is_integer,
is_list_like,
is_scalar,
is_string_dtype,
)
from cudf.core._base_index import BaseIndex, _return_get_indexer_result
from cudf.core._compat import PANDAS_LT_300
Expand Down Expand Up @@ -1623,7 +1624,7 @@ def _indices_of(self, value):
@property
@_cudf_nvtx_annotate
def str(self):
if isinstance(self._values, cudf.core.column.StringColumn):
if is_string_dtype(self.dtype):
return StringMethods(parent=self)
else:
raise AttributeError(
Expand Down

0 comments on commit d78d565

Please sign in to comment.