Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
261 changes: 252 additions & 9 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
is_platform_windows,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
Expand Down Expand Up @@ -169,8 +168,15 @@ def _cast_pointwise_result(self, values) -> ArrayLike:
return result

@classmethod
@doc(ExtensionArray._empty)
def _empty(cls, shape: Shape, dtype: ExtensionDtype) -> Self:
"""
Create an ExtensionArray with the given shape and dtype.

See also
--------
ExtensionDtype.empty
ExtensionDtype.empty is the 'official' public version of this API.
"""
dtype = cast(BaseMaskedDtype, dtype)
values: np.ndarray = np.empty(shape, dtype=dtype.type)
values.fill(dtype._internal_fill_value)
Expand Down Expand Up @@ -252,8 +258,44 @@ def _pad_or_backfill(
new_values = self
return new_values

@doc(ExtensionArray.fillna)
def fillna(self, value, limit: int | None = None, copy: bool = True) -> Self:
"""
Fill NA/NaN values using the specified method.

Parameters
----------
value : scalar, array-like
If a scalar value is passed it is used to fill all missing values.
Alternatively, an array-like "value" can be given. It's expected
that the array-like have the same length as 'self'.
limit : int, default None
The maximum number of entries where NA values will be filled.
copy : bool, default True
Whether to make a copy of the data before filling. If False, then
the original should be modified and no new memory should be allocated.
For ExtensionArray subclasses that cannot do this, it is at the
author's discretion whether to ignore "copy=False" or to raise.

Returns
-------
ExtensionArray
With NA/NaN filled.

See Also
--------
api.extensions.ExtensionArray.dropna : Return ExtensionArray without
NA values.
api.extensions.ExtensionArray.isna : A 1-D array indicating if
each value is missing.

Examples
--------
>>> arr = pd.array([np.nan, np.nan, 2, 3, np.nan, np.nan])
>>> arr.fillna(0)
<IntegerArray>
[0, 0, 2, 3, 0, 0]
Length: 6, dtype: Int64
"""
mask = self._mask
if limit is not None and limit < len(self):
modify = mask.cumsum() > limit
Expand Down Expand Up @@ -548,8 +590,30 @@ def to_numpy(
data = self._data.astype(dtype, copy=copy)
return data

@doc(ExtensionArray.tolist)
def tolist(self) -> list:
"""
Return a list of the values.

These are each a scalar type, which is a Python scalar
(for str, int, float) or a pandas scalar
(for Timestamp/Timedelta/Interval/Period)

Returns
-------
list
Python list of values in array.

See Also
--------
Index.to_list: Return a list of the values in the Index.
Series.to_list: Return a list of the values in the Series.

Examples
--------
>>> arr = pd.array([1, 2, 3])
>>> arr.tolist()
[1, 2, 3]
"""
if self.ndim > 1:
return [x.tolist() for x in self]
dtype = None if self._hasna else self._data.dtype
Expand Down Expand Up @@ -1075,10 +1139,37 @@ def _rank(

return FloatingArray(result, mask=mask)

@doc(ExtensionArray.duplicated)
def duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
"""
Return boolean ndarray denoting duplicate values.

Parameters
----------
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- False : Mark all duplicates as ``True``.

Returns
-------
ndarray[bool]
With true in indices where elements are duplicated and false otherwise.

See Also
--------
DataFrame.duplicated : Return boolean Series denoting
duplicate rows.
Series.duplicated : Indicate duplicate Series values.
api.extensions.ExtensionArray.unique : Compute the ExtensionArray
of unique values.

Examples
--------
>>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
array([False, True, False, False, True])
"""
values = self._data
mask = self._mask
return algos.duplicated(values, keep=keep, mask=mask)
Expand All @@ -1094,13 +1185,56 @@ def unique(self) -> Self:
uniques, mask = algos.unique_with_mask(self._data, self._mask)
return self._simple_new(uniques, mask)

@doc(ExtensionArray.searchsorted)
def searchsorted(
self,
value: NumpyValueArrayLike | ExtensionArray,
side: Literal["left", "right"] = "left",
sorter: NumpySorter | None = None,
) -> npt.NDArray[np.intp] | np.intp:
"""
Find indices where elements should be inserted to maintain order.

Find the indices into a sorted array `self` (a) such that, if the
corresponding elements in `value` were inserted before the indices,
the order of `self` would be preserved.

Assuming that `self` is sorted:

====== ================================
`side` returned index `i` satisfies
====== ================================
left ``self[i-1] < value <= self[i]``
right ``self[i-1] <= value < self[i]``
====== ================================

Parameters
----------
value : array-like, list or scalar
Value(s) to insert into `self`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
If 'right', return the last such index. If there is no suitable
index, return either 0 or N (where N is the length of `self`).
sorter : 1-D array-like, optional
Optional array of integer indices that sort array a into ascending
order. They are typically the result of argsort.

Returns
-------
array of ints or int
If value is array-like, array of insertion points.
If value is scalar, a single integer.

See Also
--------
numpy.searchsorted : Similar method from NumPy.

Examples
--------
>>> arr = pd.array([1, 2, 3, 5])
>>> arr.searchsorted([4])
array([3])
"""
if self._hasna:
raise ValueError(
"searchsorted requires array to be sorted, which is impossible "
Expand All @@ -1111,11 +1245,56 @@ def searchsorted(
# Base class searchsorted would cast to object, which is *much* slower.
return self._data.searchsorted(value, side=side, sorter=sorter)

@doc(ExtensionArray.factorize)
def factorize(
self,
use_na_sentinel: bool = True,
) -> tuple[np.ndarray, ExtensionArray]:
"""
Encode the extension array as an enumerated type.

Parameters
----------
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.

.. versionadded:: 1.5.0

Returns
-------
codes : ndarray
An integer NumPy array that's an indexer into the original
ExtensionArray.
uniques : ExtensionArray
An ExtensionArray containing the unique values of `self`.

.. note::

uniques will *not* contain an entry for the NA value of
the ExtensionArray if there are any missing values present
in `self`.

See Also
--------
factorize : Top-level factorize method that dispatches here.

Notes
-----
:meth:`pandas.factorize` offers a `sort` keyword as well.

Examples
--------
>>> idx1 = pd.PeriodIndex(
... ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"],
... freq="M",
... )
>>> arr, idx = idx1.factorize()
>>> arr
array([0, 0, 1, 1, 2, 2])
>>> idx
PeriodIndex(['2014-01', '2014-02', '2014-03'], dtype='period[M]')
"""
arr = self._data
mask = self._mask

Expand Down Expand Up @@ -1148,8 +1327,38 @@ def factorize(

return codes, uniques_ea

@doc(ExtensionArray._values_for_argsort)
def _values_for_argsort(self) -> np.ndarray:
"""
Return values for sorting.

Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.

See Also
--------
ExtensionArray.argsort : Return the indices that would sort this array.

Notes
-----
The caller is responsible for *not* modifying these values in-place, so
it is safe for implementers to give views on ``self``.

Functions that use this (e.g. ``ExtensionArray.argsort``) should ignore
entries with missing values in the original array (according to
``self.isna()``). This means that the corresponding entries in the returned
array don't need to be modified to sort correctly.

Examples
--------
In most cases, this is the underlying Numpy array of the ``ExtensionArray``:

>>> arr = pd.array([1, 2, 3])
>>> arr._values_for_argsort()
array([1, 2, 3])
"""
return self._data

def value_counts(self, dropna: bool = True) -> Series:
Expand Down Expand Up @@ -1198,8 +1407,42 @@ def _mode(self, dropna: bool = True) -> Self:
result = type(self)(result, res_mask)
return result[result.argsort()]

@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
"""
Return if another array is equivalent to this array.

Equivalent means that both arrays have the same shape and dtype, and
all values compare equal. Missing values in the same location are
considered equal (in contrast with normal equality).

Parameters
----------
other : ExtensionArray
Array to compare to this Array.

Returns
-------
boolean
Whether the arrays are equivalent.

See Also
--------
numpy.array_equal : Equivalent method for numpy array.
Series.equals : Equivalent method for Series.
DataFrame.equals : Equivalent method for DataFrame.

Examples
--------
>>> arr1 = pd.array([1, 2, np.nan])
>>> arr2 = pd.array([1, 2, np.nan])
>>> arr1.equals(arr2)
True

>>> arr1 = pd.array([1, 3, np.nan])
>>> arr2 = pd.array([1, 2, np.nan])
>>> arr1.equals(arr2)
False
"""
if type(self) != type(other):
return False
if other.dtype != self.dtype:
Expand Down
Loading