Skip to content

Commit

Permalink
Implements accessor for categorical types
Browse files Browse the repository at this point in the history
  • Loading branch information
wjsi committed Sep 25, 2023
1 parent 0a42ba8 commit c6b1725
Show file tree
Hide file tree
Showing 12 changed files with 329 additions and 76 deletions.
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,13 @@
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
"matplotlib": ("https://matplotlib.org/", None),
"matplotlib": ("https://matplotlib.org/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pandas": ("https://pandas.pydata.org/docs/", None),
"pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
"py": ("https://pylib.readthedocs.io/en/latest/", None),
"python": ("https://docs.python.org/3/", None),
"scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
"scipy": ("https://docs.scipy.org/doc/scipy/", None),
"statsmodels": ("https://www.statsmodels.org/devel/", None),
"pyarrow": ("https://arrow.apache.org/docs/", None),
}
Expand Down
25 changes: 25 additions & 0 deletions docs/source/reference/dataframe/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,31 @@ strings and apply several methods to it. These can be accessed like
Series.str
Series.dt


.. _generated.series.cat:

Categorical accessor
~~~~~~~~~~~~~~~~~~~~

Categorical-dtype specific methods and attributes are available under
the ``Series.cat`` accessor.

.. autosummary::
:toctree: generated/
:template: accessor_method.rst

Series.cat.categories
Series.cat.ordered
Series.cat.codes
Series.cat.rename_categories
Series.cat.reorder_categories
Series.cat.add_categories
Series.cat.remove_categories
Series.cat.set_categories
Series.cat.as_ordered
Series.cat.as_unordered


Plotting
--------
``Series.plot`` is both a callable method and a namespace attribute for
Expand Down
1 change: 1 addition & 0 deletions mars/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# noinspection PyUnresolvedReferences
from ..typing import ChunkType, TileableType, EntityType, OperandType
from .base import ExecutionError
from .context import get_context
from .entity import (
Entity,
EntityData,
Expand Down
22 changes: 11 additions & 11 deletions mars/dataframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,32 +45,32 @@

from . import arithmetic
from . import base
from . import datastore
from . import groupby
from . import indexing
from . import merge as merge_
from . import missing
from . import plotting
from . import reduction
from . import statistics
from . import sort
from . import groupby
from . import statistics
from . import ufunc
from . import datastore
from . import window
from . import plotting

del (
reduction,
statistics,
arithmetic,
indexing,
merge_,
base,
datastore,
groupby,
indexing,
merge_,
missing,
ufunc,
datastore,
plotting,
reduction,
sort,
statistics,
ufunc,
window,
plotting,
)
del DataFrameFetch, DataFrameFetchShuffle

Expand Down
15 changes: 13 additions & 2 deletions mars/dataframe/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,15 @@
def _install():
from ..core import DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE
from .standardize_range_index import ChunkStandardizeRangeIndex
from .categorical import _categorical_method_to_handlers
from .string_ import _string_method_to_handlers
from .datetimes import _datetime_method_to_handlers
from .accessor import StringAccessor, DatetimeAccessor, CachedAccessor
from .accessor import (
CachedAccessor,
CategoricalAccessor,
DatetimeAccessor,
StringAccessor,
)

for t in DATAFRAME_TYPE:
setattr(t, "to_gpu", to_gpu)
Expand Down Expand Up @@ -134,6 +140,10 @@ def _install():
setattr(t, "is_monotonic_increasing", property(fget=is_monotonic_increasing))
setattr(t, "is_monotonic_decreasing", property(fget=is_monotonic_decreasing))

for method in _categorical_method_to_handlers:
if not hasattr(CategoricalAccessor, method):
CategoricalAccessor._register(method)

for method in _string_method_to_handlers:
if not hasattr(StringAccessor, method):
StringAccessor._register(method)
Expand All @@ -143,8 +153,9 @@ def _install():
DatetimeAccessor._register(method)

for series in SERIES_TYPE:
series.str = CachedAccessor("str", StringAccessor)
series.cat = CachedAccessor("cat", CategoricalAccessor)
series.dt = CachedAccessor("dt", DatetimeAccessor)
series.str = CachedAccessor("str", StringAccessor)


_install()
Expand Down
54 changes: 52 additions & 2 deletions mars/dataframe/base/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,20 @@
from functools import wraps
from typing import Iterable

import numpy as np
import pandas as pd
from pandas.api.types import (
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_timedelta64_dtype,
is_period_dtype,
is_timedelta64_dtype,
)

from ...utils import adapt_mars_docstring
from .string_ import _string_method_to_handlers, SeriesStringMethod
from .categorical import _categorical_method_to_handlers, SeriesCategoricalMethod
from .datetimes import _datetime_method_to_handlers, SeriesDatetimeMethod
from .string_ import _string_method_to_handlers, SeriesStringMethod


class StringAccessor:
Expand Down Expand Up @@ -262,6 +265,53 @@ def __dir__(self) -> Iterable[str]:
return list(s)


class CategoricalAccessor:
    """Accessor exposing categorical-specific members of a series.

    The accessor wraps a single series whose dtype is categorical.  Besides
    the members defined directly in this class, further members can be
    attached with :meth:`_register`; each of those builds a
    ``SeriesCategoricalMethod`` operand and applies it to the wrapped series.
    """

    def __init__(self, series):
        """Bind the accessor to ``series``.

        Raises
        ------
        AttributeError
            If the dtype of ``series`` is not a categorical dtype.
        """
        # ``is_categorical_dtype`` is deprecated in recent pandas releases;
        # the isinstance check is the documented replacement and avoids a
        # deprecation warning on every ``.cat`` access.
        if not isinstance(series.dtype, pd.CategoricalDtype):
            raise AttributeError("Can only use .cat accessor with categorical values")
        self._series = series

    @property
    def ordered(self):
        """Return the ``ordered`` flag of the wrapped series' dtype."""
        return self._series.dtype.ordered

    @property
    def categories(self):
        """Return the result of calling this accessor's ``_get_categories``."""
        # NOTE(review): ``_get_categories`` is not defined in this class body;
        # presumably it is attached dynamically through ``_register`` --
        # confirm against ``_categorical_method_to_handlers``.
        return getattr(self, "_get_categories")()

    @classmethod
    def _gen_func(cls, method, is_property):
        """Build the function implementing accessor member ``method``.

        The returned function creates a ``SeriesCategoricalMethod`` operand
        carrying the method name, the ``is_property`` flag and the call
        arguments, then applies it to the wrapped series.  When pandas'
        ``Series.cat`` has a member of the same name, its metadata is copied
        onto the function and its docstring is passed through
        ``adapt_mars_docstring``.
        """

        def _inner(self, *args, **kwargs):
            op = SeriesCategoricalMethod(
                method=method,
                is_property=is_property,
                method_args=args,
                method_kwargs=kwargs,
            )
            return op(self._series)

        if hasattr(pd.Series.cat, method):
            pandas_member = getattr(pd.Series.cat, method)
            _inner = wraps(pandas_member)(_inner)
            _inner.__doc__ = adapt_mars_docstring(pandas_member.__doc__)
        return _inner

    @classmethod
    def _register(cls, method):
        """Attach ``method`` to the accessor class as a method or a property.

        A member is exposed as a property when the attribute of the same name
        on pandas' ``Series.cat`` is not callable.
        """
        # non-existing members are considered methods by default
        is_property = not callable(getattr(pd.Series.cat, method, lambda: None))
        func = cls._gen_func(method, is_property)
        if is_property:
            func = property(func)
        setattr(cls, method, func)

    def __dir__(self) -> Iterable[str]:
        """List the default attributes plus all categorical handler names."""
        names = set(super().__dir__())
        names.update(_categorical_method_to_handlers.keys())
        return list(names)


class CachedAccessor:
def __init__(self, name: str, accessor) -> None:
self._name = name
Expand Down
Loading

0 comments on commit c6b1725

Please sign in to comment.