From a83ac6f27254b2ebf99397d81b776c74f93469bf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 3 Sep 2024 10:07:49 -1000 Subject: [PATCH] Add return type annotations to MultiIndex (#16696) Mostly just return type annotations. No logic changes. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16696 --- docs/cudf/source/conf.py | 2 + python/cudf/cudf/core/multiindex.py | 109 ++++++++++++++++------------ 2 files changed, 63 insertions(+), 48 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c58bc42327c..95813907bf4 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -566,6 +566,8 @@ def on_missing_reference(app, env, node, contnode): ("py:obj", "cudf.Index.to_flat_index"), ("py:obj", "cudf.MultiIndex.to_flat_index"), ("py:meth", "pyarrow.Table.to_pandas"), + ("py:class", "pd.DataFrame"), + ("py:class", "pandas.core.indexes.frozen.FrozenList"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a66e2936e3b..e00890ac5c3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -247,7 +247,7 @@ def to_series(self, index=None, name=None): ) @_performance_tracking - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy: bool = True) -> Self: if not is_object_dtype(dtype): raise TypeError( "Setting a MultiIndex dtype to anything other than object is " @@ -256,7 +256,7 @@ def astype(self, dtype, copy: bool = True): return self @_performance_tracking - def rename(self, names, inplace=False): + def rename(self, names, inplace: bool = False) -> Self | None: """ Alter MultiIndex level names @@ -303,7 +303,9 @@ def rename(self, names, inplace=False): return self.set_names(names, 
level=None, inplace=inplace) @_performance_tracking - def set_names(self, names, level=None, inplace=False): + def set_names( + self, names, level=None, inplace: bool = False + ) -> Self | None: names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -345,7 +347,7 @@ def _from_data( cls, data: MutableMapping, name: Any = None, - ) -> MultiIndex: + ) -> Self: """ Use when you have a ColumnAccessor-like mapping but no codes and levels. """ @@ -394,7 +396,7 @@ def copy( names=None, deep=False, name=None, - ): + ) -> Self: """Returns copy of MultiIndex object. Returns a copy of `MultiIndex`. The `levels` and `codes` value can be @@ -457,7 +459,7 @@ def copy( ) @_performance_tracking - def __repr__(self): + def __repr__(self) -> str: max_seq_items = pd.get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: @@ -503,7 +505,7 @@ def __repr__(self): @property # type: ignore @_external_only_api("Use ._codes instead") @_performance_tracking - def codes(self): + def codes(self) -> pd.core.indexes.frozen.FrozenList: """ Returns the codes of the underlying MultiIndex. @@ -531,7 +533,7 @@ def get_slice_bound(self, label, side): @property # type: ignore @_performance_tracking - def nlevels(self): + def nlevels(self) -> int: """Integer number of levels in this MultiIndex.""" return self._num_columns @@ -590,7 +592,7 @@ def _get_level_label(self, level): return self.names[level] @_performance_tracking - def isin(self, values, level=None): + def isin(self, values, level=None) -> cp.ndarray: """Return a boolean array where the index values are in values. Compute boolean array of whether each index value is found in @@ -864,7 +866,7 @@ def _validate_indexer( | slice | tuple[Any, ...] 
| list[tuple[Any, ...]], - ): + ) -> None: if isinstance(indexer, numbers.Number): return if isinstance(indexer, tuple): @@ -900,12 +902,12 @@ def __eq__(self, other): @property # type: ignore @_performance_tracking - def size(self): + def size(self) -> int: # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows @_performance_tracking - def take(self, indices): + def take(self, indices) -> Self: if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") obj = super().take(indices) @@ -957,7 +959,12 @@ def __getitem__(self, index): return result @_performance_tracking - def to_frame(self, index=True, name=no_default, allow_duplicates=False): + def to_frame( + self, + index: bool = True, + name=no_default, + allow_duplicates: bool = False, + ) -> cudf.DataFrame: """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1034,7 +1041,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): ) @_performance_tracking - def get_level_values(self, level): + def get_level_values(self, level) -> cudf.Index: """ Return the values at the requested level @@ -1067,30 +1074,30 @@ def get_level_values(self, level): ) return level_values - def _is_numeric(self): + def _is_numeric(self) -> bool: return False - def _is_boolean(self): + def _is_boolean(self) -> bool: return False - def _is_integer(self): + def _is_integer(self) -> bool: return False - def _is_floating(self): + def _is_floating(self) -> bool: return False - def _is_object(self): + def _is_object(self) -> bool: return False - def _is_categorical(self): + def _is_categorical(self) -> bool: return False - def _is_interval(self): + def _is_interval(self) -> bool: return False @classmethod @_performance_tracking - def _concat(cls, objs): + def _concat(cls, objs) -> Self: source_data = [o.to_frame(index=False) for o in objs] # TODO: Verify if this is really necessary or if we can rely on @@ -1100,17 +1107,19 
@@ def _concat(cls, objs): for obj in source_data[1:]: obj.columns = colnames - source_data = cudf.DataFrame._concat(source_data) + source_df = cudf.DataFrame._concat(source_data) try: # Only set names if all objs have the same names (names,) = {o.names for o in objs} - {None} except ValueError: - names = [None] * source_data._num_columns - return cudf.MultiIndex.from_frame(source_data, names=names) + names = [None] * source_df._num_columns + return cudf.MultiIndex.from_frame(source_df, names=names) @classmethod @_performance_tracking - def from_tuples(cls, tuples, sortorder: int | None = None, names=None): + def from_tuples( + cls, tuples, sortorder: int | None = None, names=None + ) -> Self: """ Convert list of tuples to MultiIndex. @@ -1153,7 +1162,7 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): return cls.from_pandas(pdi) @_performance_tracking - def to_numpy(self): + def to_numpy(self) -> np.ndarray: return self.values_host def to_flat_index(self): @@ -1167,7 +1176,7 @@ def to_flat_index(self): @property # type: ignore @_performance_tracking - def values_host(self): + def values_host(self) -> np.ndarray: """ Return a numpy representation of the MultiIndex. @@ -1195,7 +1204,7 @@ def values_host(self): @property # type: ignore @_performance_tracking - def values(self): + def values(self) -> cp.ndarray: """ Return a CuPy representation of the MultiIndex. @@ -1236,7 +1245,7 @@ def from_frame( df: pd.DataFrame | cudf.DataFrame, sortorder: int | None = None, names=None, - ): + ) -> Self: """ Make a MultiIndex from a DataFrame. @@ -1303,7 +1312,9 @@ def from_frame( @classmethod @_performance_tracking - def from_product(cls, iterables, sortorder: int | None = None, names=None): + def from_product( + cls, iterables, sortorder: int | None = None, names=None + ) -> Self: """ Make a MultiIndex from the cartesian product of multiple iterables. 
@@ -1355,7 +1366,7 @@ def from_arrays( arrays, sortorder=None, names=None, - ) -> MultiIndex: + ) -> Self: """ Convert arrays to MultiIndex. @@ -1410,7 +1421,7 @@ def from_arrays( ) @_performance_tracking - def _poplevels(self, level): + def _poplevels(self, level) -> None | MultiIndex | cudf.Index: """ Remove and return the specified levels from self. @@ -1461,7 +1472,7 @@ def _poplevels(self, level): return popped @_performance_tracking - def swaplevel(self, i=-2, j=-1): + def swaplevel(self, i=-2, j=-1) -> Self: """ Swap level i with level j. Calling this method does not change the ordering of the values. @@ -1512,7 +1523,7 @@ def swaplevel(self, i=-2, j=-1): return midx @_performance_tracking - def droplevel(self, level=-1): + def droplevel(self, level=-1) -> MultiIndex | cudf.Index: """ Removes the specified levels from the MultiIndex. @@ -1598,7 +1609,9 @@ def to_pandas( @classmethod @_performance_tracking - def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): + def from_pandas( + cls, multiindex: pd.MultiIndex, nan_as_null=no_default + ) -> Self: """ Convert from a Pandas MultiIndex @@ -1633,11 +1646,11 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): @cached_property # type: ignore @_performance_tracking - def is_unique(self): + def is_unique(self) -> bool: return len(self) == len(self.unique()) @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("O") @_performance_tracking @@ -1706,7 +1719,7 @@ def is_monotonic_decreasing(self) -> bool: ) @_performance_tracking - def fillna(self, value): + def fillna(self, value) -> Self: """ Fill null values with the specified value. 
@@ -1758,7 +1771,7 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self): + def _clean_nulls_from_index(self) -> Self: """ Convert all na values(if any) in MultiIndex object to `<NA>` as a preprocessing step to `__repr__` methods. @@ -1769,20 +1782,20 @@ def _clean_nulls_from_index(self): ) @_performance_tracking - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) usage += sum(level.memory_usage(deep=deep) for level in self._levels) usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking - def difference(self, other, sort=None): + def difference(self, other, sort=None) -> Self: if hasattr(other, "to_pandas"): other = other.to_pandas() return cudf.from_pandas(self.to_pandas().difference(other, sort)) @_performance_tracking - def append(self, other): + def append(self, other) -> Self: """ Append a collection of MultiIndex objects together @@ -2000,7 +2013,7 @@ def get_loc(self, key): mask[true_inds] = True return mask - def _get_reconciled_name_object(self, other) -> MultiIndex: + def _get_reconciled_name_object(self, other) -> Self: """ If the result of a set operation will be self, return self, unless the names change, in which @@ -2026,7 +2039,7 @@ def _maybe_match_names(self, other): ] @_performance_tracking - def union(self, other, sort=None): + def union(self, other, sort=None) -> Self: if not isinstance(other, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" try: @@ -2050,7 +2063,7 @@ def union(self, other, sort=None): return self._union(other, sort=sort) @_performance_tracking - def _union(self, other, sort=None): + def _union(self, other, sort=None) -> Self: # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common # logic between MultiIndex._union & BaseIndex._union
into @@ -2076,7 +2089,7 @@ def _union(self, other, sort=None): return midx @_performance_tracking - def _intersection(self, other, sort=None): + def _intersection(self, other, sort=None) -> Self: if self.names != other.names: deep = True col_names = list(range(0, self.nlevels)) @@ -2167,7 +2180,7 @@ def _columns_for_reset_index( else: yield from self._split_columns_by_levels(levels, in_levels=True) - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> Self: return self._from_data( self._data._from_columns_like_self( super()._repeat([*self._columns], repeats, axis)