diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index b618f33a6e5..c483d459833 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -819,7 +820,17 @@ struct operator_functor { template struct cast { static constexpr auto arity{1}; - template + template ()>* = nullptr> + __device__ inline auto operator()(From f) -> To + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template ()>* = nullptr> __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index b7aadbe14fa..99ca89cc021 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -187,7 +187,11 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() or input.null_count() == input.size()) { return 0; } + if (0 == input.size()) { return 0; } + + if (input.null_count() == input.size()) { + return static_cast(null_handling == null_policy::INCLUDE); + } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); diff --git a/pyproject.toml b/pyproject.toml index c602240a0b7..2f59864894b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"] +select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index c667286fc16..bcab009c102 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Dict, Optional, Tuple - from typing_extensions import Self from cudf._typing import Dtype, DtypeObj, ScalarLike @@ -11,27 +9,27 @@ from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase class Column: - _data: Optional[Buffer] - _mask: Optional[Buffer] - _base_data: Optional[Buffer] - _base_mask: Optional[Buffer] + _data: Buffer | None + _mask: Buffer | None + _base_data: Buffer | None + _base_mask: Buffer | None _dtype: DtypeObj _size: int _offset: int _null_count: int - _children: Tuple[ColumnBase, ...] - _base_children: Tuple[ColumnBase, ...] - _distinct_count: Dict[bool, int] + _children: tuple[ColumnBase, ...] + _base_children: tuple[ColumnBase, ...] + _distinct_count: dict[bool, int] def __init__( self, - data: Optional[Buffer], + data: Buffer | None, size: int, dtype: Dtype, - mask: Optional[Buffer] = None, - offset: Optional[int] = None, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + mask: Buffer | None = None, + offset: int | None = None, + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), ) -> None: ... @property def base_size(self) -> int: ... @@ -40,9 +38,9 @@ class Column: @property def size(self) -> int: ... @property - def base_data(self) -> Optional[Buffer]: ... + def base_data(self) -> Buffer | None: ... @property - def data(self) -> Optional[Buffer]: ... + def data(self) -> Buffer | None: ... @property def data_ptr(self) -> int: ... def set_base_data(self, value: Buffer) -> None: ... @@ -50,25 +48,25 @@ class Column: def nullable(self) -> bool: ... def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[Buffer]: ... + def base_mask(self) -> Buffer | None: ... @property - def mask(self) -> Optional[Buffer]: ... + def mask(self) -> Buffer | None: ... @property def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Optional[Buffer]) -> None: ... - def set_mask(self, value: Optional[Buffer]) -> Self: ... + def set_base_mask(self, value: Buffer | None) -> None: ... + def set_mask(self, value: Buffer | None) -> Self: ... @property def null_count(self) -> int: ... @property def offset(self) -> int: ... @property - def base_children(self) -> Tuple[ColumnBase, ...]: ... + def base_children(self) -> tuple[ColumnBase, ...]: ... @property - def children(self) -> Tuple[ColumnBase, ...]: ... - def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... + def children(self) -> tuple[ColumnBase, ...]: ... + def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False - ) -> Optional[Self]: ... + ) -> Self | None: ... # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 42b1524bd76..d97e9c815b6 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -8,7 +8,7 @@ from collections import abc from functools import wraps from inspect import isclass -from typing import List, Union, cast +from typing import cast import cupy as cp import numpy as np @@ -219,7 +219,7 @@ def wrapped_func(obj): def _union_categoricals( - to_union: List[Union[cudf.Series, cudf.CategoricalIndex]], + to_union: list[cudf.Series | cudf.CategoricalIndex], sort_categories: bool = False, ignore_order: bool = False, ): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index b29fc475b29..e71e45e410e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -5,7 +5,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Literal, Set, Tuple +from typing import TYPE_CHECKING, Any, Literal import pandas as pd from typing_extensions import Self @@ -44,11 +44,11 @@ class BaseIndex(Serializable): """Base class for all cudf Index types.""" - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _data: ColumnAccessor @property - def _columns(self) -> Tuple[Any, ...]: + def _columns(self) -> tuple[Any, ...]: raise NotImplementedError @cached_property @@ -342,9 +342,9 @@ def deserialize(cls, header, frames): @property def names(self): """ - Returns a tuple containing the name of the Index. + Returns a FrozenList containing the name of the Index. """ - return (self.name,) + return pd.core.indexes.frozen.FrozenList([self.name]) @names.setter def names(self, values): diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 5cb9f0363e0..393a68dd844 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -1,8 +1,8 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +from __future__ import annotations import ast import functools -from typing import List, Tuple from cudf._lib.expressions import ( ASTOperator, @@ -98,9 +98,9 @@ class libcudfASTVisitor(ast.NodeVisitor): The column names used to map the names in an expression. """ - def __init__(self, col_names: Tuple[str]): - self.stack: List[Expression] = [] - self.nodes: List[Expression] = [] + def __init__(self, col_names: tuple[str]): + self.stack: list[Expression] = [] + self.nodes: list[Expression] = [] self.col_names = col_names @property @@ -218,7 +218,7 @@ def visit_Call(self, node): @functools.lru_cache(256) -def parse_expression(expr: str, col_names: Tuple[str]): +def parse_expression(expr: str, col_names: tuple[str]): visitor = libcudfASTVisitor(col_names) visitor.visit(ast.parse(expr)) return visitor diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index f04cae719c2..269fcf3e37f 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,20 +1,23 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from __future__ import annotations import os import zoneinfo from functools import lru_cache -from typing import Literal, Tuple +from typing import TYPE_CHECKING, Literal import numpy as np from cudf._lib.timezone import make_timezone_transition_table from cudf.core.column.column import as_column -from cudf.core.column.datetime import DatetimeColumn -from cudf.core.column.timedelta import TimeDeltaColumn + +if TYPE_CHECKING: + from cudf.core.column.datetime import DatetimeColumn + from cudf.core.column.timedelta import TimeDeltaColumn @lru_cache(maxsize=20) -def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. @@ -40,7 +43,7 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: def _find_and_read_tzfile_tzpath( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_columns(search_path, zone_name) @@ -49,7 +52,7 @@ def _find_and_read_tzfile_tzpath( def _find_and_read_tzfile_tzdata( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: import importlib.resources package_base = "tzdata.zoneinfo" @@ -78,7 +81,7 @@ def _find_and_read_tzfile_tzdata( def _read_tzfile_as_columns( tzdir, zone_name: str -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -92,7 +95,7 @@ def _read_tzfile_as_columns( def check_ambiguous_and_nonexistent( ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] -) -> Tuple[Literal["NaT"], Literal["NaT"]]: +) -> tuple[Literal["NaT"], Literal["NaT"]]: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index ef6b10f66c1..44ce0ddef25 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,18 +1,17 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations import warnings -from typing import Tuple, Union +from typing import TYPE_CHECKING import numpy as np import cudf -from cudf._typing import ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, is_scalar, ) -from cudf.core.column import ColumnBase from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -21,6 +20,10 @@ is_mixed_with_object_dtype, ) +if TYPE_CHECKING: + from cudf._typing import ScalarLike + from cudf.core.column import ColumnBase + def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): @@ -41,9 +44,9 @@ def _normalize_categorical(input_col, other): def _check_and_cast_columns_with_other( source_col: ColumnBase, - other: Union[ScalarLike, ColumnBase], + other: ScalarLike | ColumnBase, inplace: bool, -) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: +) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index bf6f9f1a3c1..80dbbe4c048 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Dict, Literal, Mapping, Optional, Tuple +from typing import Any, Literal, Mapping import numpy from typing_extensions import Self @@ -42,7 +42,7 @@ def host_memory_allocation(nbytes: int) -> memoryview: def cuda_array_interface_wrapper( ptr: int, size: int, - owner: Optional[object] = None, + owner: object | None = None, readonly=False, typestr="|u1", version=0, @@ -278,7 +278,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: return self._ptr def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: """Read-only access to the buffer through host memory.""" size = self._size if size is None else size @@ -319,7 +319,7 @@ def __init__( *, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: size = owner.size if size is None else size if size < 0: @@ -414,7 +414,7 @@ def __cuda_array_interface__(self) -> Mapping: "version": 0, } - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the buffer into header and frames. The frames can be a mixture of memoryview, Buffer, and BufferOwner @@ -427,7 +427,7 @@ def serialize(self) -> Tuple[dict, list]: serializable metadata required to reconstruct the object. The second element is a list containing single frame. """ - header: Dict[str, Any] = {} + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) header["frame_count"] = 1 @@ -480,7 +480,7 @@ def __str__(self) -> str: ) -def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: +def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]: """Retrieve the pointer and size from an array interface. Raises ValueError if array isn't C-contiguous. diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 15f00fc670d..0bd8d6054b3 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Literal, Mapping, Optional +from typing import Literal, Mapping from typing_extensions import Self @@ -27,7 +27,7 @@ def __init__( self, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: super().__init__(owner=owner, offset=offset, size=size) self.owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 7bcf97302aa..762cd7f9e86 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -13,7 +13,7 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING import rmm.mr @@ -39,7 +39,7 @@ def get_traceback() -> str: def get_rmm_memory_resource_stack( mr: rmm.mr.DeviceMemoryResource, -) -> List[rmm.mr.DeviceMemoryResource]: +) -> list[rmm.mr.DeviceMemoryResource]: """Get the RMM resource stack Parameters @@ -99,14 +99,14 @@ class Expose: total_nbytes: int = 0 spilled_nbytes: int = 0 - spill_totals: Dict[Tuple[str, str], Tuple[int, float]] + spill_totals: dict[tuple[str, str], tuple[int, float]] def __init__(self, level) -> None: self.lock = threading.Lock() self.level = level self.spill_totals = defaultdict(lambda: (0, 0)) # Maps each traceback to a Expose - self.exposes: Dict[str, SpillStatistics.Expose] = {} + self.exposes: dict[str, SpillStatistics.Expose] = {} def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: """Log a (un-)spilling event @@ -227,7 +227,7 @@ class SpillManager: def __init__( self, *, - device_memory_limit: Optional[int] = None, + device_memory_limit: int | None = None, statistic_level: int = 0, ) -> None: self._lock = threading.Lock() @@ -298,7 +298,7 @@ def add(self, buffer: SpillableBufferOwner) -> None: def buffers( self, order_by_access_time: bool = False - ) -> Tuple[SpillableBufferOwner, ...]: + ) -> tuple[SpillableBufferOwner, ...]: """Get all managed buffers Parameters @@ -347,7 +347,7 @@ def spill_device_memory(self, nbytes: int) -> int: buf.lock.release() return spilled - def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: + def spill_to_device_limit(self, device_limit: int | None = None) -> int: """Try to spill device memory until device limit Notice, by default this is a no-op. @@ -402,10 +402,10 @@ def __repr__(self) -> str: # - Initialized to None (spilling disabled) # - Initialized to a SpillManager instance (spilling enabled) _global_manager_uninitialized: bool = True -_global_manager: Optional[SpillManager] = None +_global_manager: SpillManager | None = None -def set_global_manager(manager: Optional[SpillManager]) -> None: +def set_global_manager(manager: SpillManager | None) -> None: """Set the global manager, which if None disables spilling""" global _global_manager, _global_manager_uninitialized @@ -419,7 +419,7 @@ def set_global_manager(manager: Optional[SpillManager]) -> None: _global_manager_uninitialized = False -def get_global_manager() -> Optional[SpillManager]: +def get_global_manager() -> SpillManager | None: """Get the global manager or None if spilling is disabled""" global _global_manager_uninitialized if _global_manager_uninitialized: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 49258fea9ab..eb57a371965 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -7,7 +7,7 @@ import time import weakref from threading import RLock -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -88,10 +88,10 @@ class SpillableBufferOwner(BufferOwner): lock: RLock _spill_locks: weakref.WeakSet _last_accessed: float - _ptr_desc: Dict[str, Any] + _ptr_desc: dict[str, Any] _manager: SpillManager - def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: + def _finalize_init(self, ptr_desc: dict[str, Any]) -> None: """Finish initialization of the spillable buffer This implements the common initialization that `from_device_memory` @@ -297,7 +297,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: self._last_accessed = time.monotonic() return self._ptr - def memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: """Get pointer, size, and device type of this buffer. Warning, it is not safe to access the pointer value without @@ -341,7 +341,7 @@ def __cuda_array_interface__(self) -> dict: } def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: size = self._size if size is None else size with self.lock: @@ -388,11 +388,11 @@ def spillable(self) -> bool: def spill_lock(self, spill_lock: SpillLock) -> None: self._owner.spill_lock(spill_lock=spill_lock) - def memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: (ptr, _, device_type) = self._owner.memory_info() return (ptr + self._offset, self.nbytes, device_type) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the Buffer Normally, we would use `[self]` as the frames. This would work but @@ -411,8 +411,8 @@ def serialize(self) -> Tuple[dict, list]: given to `.deserialize()`, otherwise we would have a `Buffer` pointing to memory already owned by an existing `SpillableBufferOwner`. """ - header: Dict[str, Any] = {} - frames: List[Buffer | memoryview] + header: dict[str, Any] = {} + frames: list[Buffer | memoryview] with self._owner.lock: header["type-serialized"] = pickle.dumps(self.__class__) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 3346d05ed4a..42a1501c914 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -4,7 +4,7 @@ import threading from contextlib import ContextDecorator -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any from cudf.core.buffer.buffer import ( Buffer, @@ -22,7 +22,7 @@ from cudf.options import get_option -def get_buffer_owner(data: Any) -> Optional[BufferOwner]: +def get_buffer_owner(data: Any) -> BufferOwner | None: """Get the owner of `data`, if one exists Search through the stack of data owners in order to find an @@ -47,10 +47,10 @@ def get_buffer_owner(data: Any) -> Optional[BufferOwner]: def as_buffer( - data: Union[int, Any], + data: int | Any, *, - size: Optional[int] = None, - owner: Optional[object] = None, + size: int | None = None, + owner: object | None = None, exposed: bool = False, ) -> Buffer: """Factory function to wrap `data` in a Buffer object. @@ -117,8 +117,8 @@ def as_buffer( ) # Find the buffer types to return based on the current config - owner_class: Type[BufferOwner] - buffer_class: Type[Buffer] + owner_class: type[BufferOwner] + buffer_class: type[Buffer] if get_global_manager() is not None: owner_class = SpillableBufferOwner buffer_class = SpillableBuffer @@ -161,7 +161,7 @@ def as_buffer( return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) -_thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {} +_thread_spill_locks: dict[int, tuple[SpillLock | None, int]] = {} def _push_thread_spill_lock() -> None: @@ -193,7 +193,7 @@ class acquire_spill_lock(ContextDecorator): pushing and popping from `_thread_spill_locks` using its thread ID. """ - def __enter__(self) -> Optional[SpillLock]: + def __enter__(self) -> SpillLock | None: _push_thread_spill_lock() return get_spill_lock() @@ -201,7 +201,7 @@ def __exit__(self, *exc): _pop_thread_spill_lock() -def get_spill_lock() -> Union[SpillLock, None]: +def get_spill_lock() -> SpillLock | None: """Return a spill lock within the context of `acquire_spill_lock` or None Returns None, if spilling is disabled. diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 97c2ce5cf1f..f538180805b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast import numpy as np import pandas as pd @@ -139,7 +139,7 @@ def ordered(self) -> bool: """ return self._column.ordered - def as_ordered(self) -> Optional[SeriesOrIndex]: + def as_ordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be ordered. @@ -175,7 +175,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=True)) - def as_unordered(self) -> Optional[SeriesOrIndex]: + def as_unordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be unordered. @@ -222,7 +222,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=False)) - def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: + def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: """ Add new categories. @@ -294,7 +294,7 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: def remove_categories( self, removals: Any, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Remove the specified categories. @@ -370,7 +370,7 @@ def set_categories( new_categories: Any, ordered: bool = False, rename: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Set the categories to the specified new_categories. @@ -443,7 +443,7 @@ def reorder_categories( self, new_categories: Any, ordered: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Reorder categories as specified in new_categories. @@ -521,8 +521,8 @@ class CategoricalColumn(column.ColumnBase): """ dtype: cudf.core.dtypes.CategoricalDtype - _codes: Optional[NumericalColumn] - _children: Tuple[NumericalColumn] + _codes: NumericalColumn | None + _children: tuple[NumericalColumn] _VALID_REDUCTIONS = { "max", "min", @@ -539,11 +539,11 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] = (), ): if size is None: for child in children: @@ -590,23 +590,23 @@ def set_base_data(self, value): def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = self # We need to convert values to same type as self, # hence passing dtype=self.dtype rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Optional[Buffer]): + def set_base_mask(self, value: Buffer | None): super().set_base_mask(value) self._codes = None - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self) -> Tuple[NumericalColumn]: + def children(self) -> tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] start = self.offset * codes_column.dtype.itemsize @@ -693,9 +693,7 @@ def _fill( libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: codes = self.codes.slice(start, stop, stride) return cast( Self, @@ -714,7 +712,7 @@ def slice( def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -1073,7 +1071,7 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -1207,7 +1205,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes @@ -1468,7 +1466,7 @@ def _create_empty_categorical_column( def pandas_categorical_as_column( - categorical: ColumnLike, codes: Optional[ColumnLike] = None + categorical: ColumnLike, codes: ColumnLike | None = None ) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dc937dc0469..c4e715aeb45 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -7,19 +7,7 @@ from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Literal, - MutableSequence, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast import cupy import numpy as np @@ -394,7 +382,7 @@ def _fill( begin: int, end: int, inplace: bool = False, - ) -> Optional[Self]: + ) -> Self | None: if end <= begin or begin >= self.size: return self if inplace else self.copy() @@ -532,9 +520,7 @@ def element_indexing(self, index: int): raise IndexError("single positional indexer is out-of-bounds") return libcudf.copying.get_element(self, idx).value - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -570,7 +556,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value, dtype=self.dtype) ) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. if isinstance(key, slice): out = self._scatter_by_slice(key, value_normalized) else: @@ -593,8 +579,8 @@ def _wrap_binop_normalization(self, other): def _scatter_by_slice( self, key: builtins.slice, - value: Union[cudf.core.scalar.Scalar, ColumnBase], - ) -> Optional[Self]: + value: cudf.core.scalar.Scalar | ColumnBase, + ) -> Self | None: """If this function returns None, it's either a no-op (slice is empty), or the inplace replacement is already performed (fill-in-place). """ @@ -630,7 +616,7 @@ def _scatter_by_slice( def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, - value: Union[cudf.core.scalar.Scalar, ColumnBase], + value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: if is_bool_dtype(key.dtype): # `key` is boolean mask @@ -667,7 +653,7 @@ def _scatter_by_column( ]._with_type_metadata(self.dtype) def _check_scatter_key_length( - self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase] + self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase ) -> None: """`num_keys` is the number of keys to scatter. Should equal to the number of rows in ``value`` if ``value`` is a column. @@ -682,7 +668,7 @@ def _check_scatter_key_length( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. @@ -740,7 +726,7 @@ def indices_of( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] - def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: + def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: indices = self.indices_of(value) if n := len(indices): return ( @@ -856,7 +842,7 @@ def isin(self, values: Sequence) -> ColumnBase: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: """ Helper function for `isin` which pre-process `values` based on `self`. """ @@ -868,7 +854,7 @@ def _process_values_for_isin( rhs = rhs.astype(lhs.dtype) return lhs, rhs - def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: + def _isin_earlystop(self, rhs: ColumnBase) -> ColumnBase | None: """ Helper function for `isin` which determines possibility of early-stopping or not. @@ -1070,7 +1056,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype - ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: + ) -> "cudf.core.column.decimal.DecimalBaseColumn": raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: @@ -1154,7 +1140,7 @@ def unique(self) -> ColumnBase: self.dtype ) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: # data model: # Serialization produces a nested metadata "header" and a flattened @@ -1167,7 +1153,7 @@ def serialize(self) -> Tuple[dict, list]: # cudf native or foreign some special-casing is required here for # serialization. - header: Dict[Any, Any] = {} + header: dict[Any, Any] = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) try: @@ -1200,7 +1186,7 @@ def serialize(self) -> Tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list) -> ColumnBase: - def unpack(header, frames) -> Tuple[Any, list]: + def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] klass = pickle.loads(header["type-serialized"]) obj = klass.deserialize(header, frames[:count]) @@ -1247,13 +1233,13 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, ScalarLike]: + ) -> ColumnBase | ScalarLike: raise NotImplementedError def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -1274,8 +1260,8 @@ def _reduce( return preprocessed def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + self, skipna: bool | None = None, min_count: int = 0 + ) -> ColumnBase | ScalarLike: if skipna is None: skipna = True @@ -1315,8 +1301,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def _label_encoding( self, cats: ColumnBase, - dtype: Optional[Dtype] = None, - na_sentinel: Optional[ScalarLike] = None, + dtype: Dtype | None = None, + na_sentinel: ScalarLike | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1389,9 +1375,9 @@ def _return_sentinel_column(): def column_empty_like( column: ColumnBase, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, masked: bool = False, - newsize: Optional[int] = None, + newsize: int | None = None, ) -> ColumnBase: """Allocate a new column like the given *column*""" if dtype is None: @@ -1446,7 +1432,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) - children = () # type: Tuple[ColumnBase, ...] + children: tuple[ColumnBase, ...] = () if isinstance(dtype, StructDtype): data = None @@ -1496,14 +1482,14 @@ def column_empty( def build_column( - data: Union[Buffer, None], + data: Buffer | None, dtype: Dtype, *, - size: Optional[int] = None, - mask: Optional[Buffer] = None, + size: int | None = None, + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), ) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1665,10 +1651,10 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ordered: bool = False, ) -> "cudf.core.column.CategoricalColumn": """ @@ -1715,7 +1701,7 @@ def check_invalid_array(shape: tuple, dtype): raise TypeError("Unsupported type float16") -def as_memoryview(arbitrary: Any) -> Optional[memoryview]: +def as_memoryview(arbitrary: Any) -> memoryview | None: try: return memoryview(arbitrary) except TypeError: @@ -1724,9 +1710,9 @@ def as_memoryview(arbitrary: Any) -> Optional[memoryview]: def as_column( arbitrary: Any, - nan_as_null: Optional[bool] = None, - dtype: Optional[Dtype] = None, - length: Optional[int] = None, + nan_as_null: bool | None = None, + dtype: Dtype | None = None, + length: int | None = None, ): """Create a Column from an arbitrary object @@ -2199,7 +2185,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") -def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: +def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: """ Return the headers and frames resulting from serializing a list of Column @@ -2216,7 +2202,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: frames : list list of frames """ - headers: List[Dict[Any, Any]] = [] + headers: list[dict[Any, Any]] = [] frames = [] if len(columns) > 0: @@ -2228,7 +2214,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: return headers, frames -def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: +def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index e24d85bfedf..7fdebda7d76 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Literal, Sequence, cast import numpy as np import pandas as pd @@ -242,10 +242,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "M": @@ -499,7 +499,7 @@ def mean( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, @@ -511,7 +511,7 @@ def std( * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: + def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -631,7 +631,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -703,7 +703,7 @@ def _with_type_metadata(self, dtype): def _find_ambiguous_and_nonexistent( self, zone_name: str - ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]: + ) -> tuple[NumericalColumn, NumericalColumn] | tuple[bool, bool]: """ Recognize ambiguous and nonexistent timestamps for the given timezone. @@ -822,10 +822,10 @@ def __init__( self, data: Buffer, dtype: pd.DatetimeTZDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): super().__init__( data=data, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9c1bedc9926..e9d9b4933e5 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from decimal import Decimal -from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import cupy as cp import numpy as np @@ -49,7 +49,7 @@ def __cuda_array_interface__(self): def as_decimal_column( self, dtype: Dtype, - ) -> Union["DecimalBaseColumn"]: + ) -> "DecimalBaseColumn": if ( isinstance(dtype, cudf.core.dtypes.DecimalDtype) and dtype.scale < self.dtype.scale @@ -138,7 +138,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. @@ -199,7 +199,7 @@ def normalize_binop_value(self, other): return NotImplemented def _decimal_quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: float | Sequence[float], interpolation: str, exact: bool ) -> ColumnBase: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 080ba949d62..c548db67344 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Sequence import numpy as np import pandas as pd @@ -167,7 +167,7 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) _, values = value self._dtype = cudf.ListDtype(element_type=values.dtype) @@ -269,7 +269,7 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: # as ``self``, but with the leaf column transformed # by applying ``func`` to it - cc: List[ListColumn] = [] + cc: list[ListColumn] = [] c: ColumnBase = self while isinstance(c, ListColumn): @@ -320,7 +320,7 @@ def __init__(self, parent: ParentType): def get( self, index: int, - default: Optional[Union[ScalarLike, ColumnLike]] = None, + default: ScalarLike | ColumnLike | None = None, ) -> ParentType: """ Extract element at the given index from each list in a Series of lists. @@ -424,7 +424,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: contains_scalar(self._column, cudf.Scalar(search_key)) ) - def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType: + def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ Returns integers representing the index of the search key for each row. diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7f7355c571a..7c6f4e05577 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional, Union, overload +from typing import Union, overload from typing_extensions import Literal @@ -52,7 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: ... + ) -> ParentType | None: ... def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6af67e02bb4..098cf43421b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,16 +3,7 @@ from __future__ import annotations import functools -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Sequence, cast import cupy as cp import numpy as np @@ -85,10 +76,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make this non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make this non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) @@ -179,7 +170,7 @@ def __setitem__(self, key: Any, value: Any): else: device_value = device_value.astype(self.dtype) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. if isinstance(key, slice): out = self._scatter_by_slice(key, device_value) else: @@ -196,7 +187,7 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) - def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): return libcudf.transform.transform(self, unaryop) @@ -302,7 +293,7 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, cudf.Scalar]: + ) -> ColumnBase | cudf.Scalar: if isinstance(other, ColumnBase): if not isinstance(other, NumericalColumn): return NotImplemented @@ -422,7 +413,7 @@ def nan_count(self) -> int: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = cast("cudf.core.column.ColumnBase", self) try: rhs = as_column(values, nan_as_null=False) @@ -456,12 +447,12 @@ def _process_values_for_isin( return lhs, rhs - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[NumericalColumn, ScalarLike]: + self, skipna: bool | None = None, min_count: int = 0 + ) -> NumericalColumn | ScalarLike: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): @@ -544,7 +535,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -730,7 +721,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: def _normalize_find_and_replace_input( - input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] + input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list ) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index bd48054a951..95c78c5efcb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, cast +from typing import TYPE_CHECKING, cast import numpy as np @@ -42,10 +42,10 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() - def kurtosis(self, skipna: Optional[bool] = None) -> float: + def kurtosis(self, skipna: bool | None = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -70,7 +70,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna: Optional[bool] = None) -> ScalarLike: + def skew(self, skipna: bool | None = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -142,7 +142,7 @@ def quantile( def mean( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ): @@ -152,7 +152,7 @@ def mean( def var( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -163,7 +163,7 @@ def var( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -172,7 +172,7 @@ def std( "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) - def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn: + def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 87df2d2f1f1..2451a9cc0af 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,16 +5,7 @@ import re import warnings from functools import cached_property -from typing import ( - TYPE_CHECKING, - Any, - Optional, - Sequence, - Tuple, - Union, - cast, - overload, -) +from typing import TYPE_CHECKING, Any, Sequence, cast, overload import numpy as np import pandas as pd @@ -257,13 +248,13 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( - self, sep: Optional[str] = None, na_rep: Optional[str] = None + self, sep: str | None = None, na_rep: str | None = None ) -> str: ... @overload def cat( - self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... + self, others, sep: str | None = None, na_rep: str | None = None + ) -> SeriesOrIndex | "cudf.core.column.string.StringColumn": ... def cat(self, others=None, sep=None, na_rep=None): """ @@ -641,7 +632,7 @@ def extract( def contains( self, - pat: Union[str, Sequence], + pat: str | Sequence, case: bool = True, flags: int = 0, na=np.nan, @@ -792,7 +783,7 @@ def contains( result_col = libstrings.contains_multiple(input_column, pat) return self._return_or_inplace(result_col) - def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: + def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: """ Test if a like pattern matches a string of a Series or Index. @@ -863,7 +854,7 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: def repeat( self, - repeats: Union[int, Sequence], + repeats: int | Sequence, ) -> SeriesOrIndex: """ Duplicate each string in the Series or Index. @@ -920,8 +911,8 @@ def repeat( def replace( self, - pat: Union[str, Sequence], - repl: Union[str, Sequence], + pat: str | Sequence, + repl: str | Sequence, n: int = -1, case=None, flags: int = 0, @@ -1074,9 +1065,9 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: def slice( self, - start: Optional[int] = None, - stop: Optional[int] = None, - step: Optional[int] = None, + start: int | None = None, + stop: int | None = None, + step: int | None = None, ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -2051,7 +2042,7 @@ def istitle(self) -> SeriesOrIndex: return self._return_or_inplace(libstrings.is_title(self._column)) def filter_alphanum( - self, repl: Optional[str] = None, keep: bool = True + self, repl: str | None = None, keep: bool = True ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. @@ -2138,9 +2129,9 @@ def slice_from( def slice_replace( self, - start: Optional[int] = None, - stop: Optional[int] = None, - repl: Optional[str] = None, + start: int | None = None, + stop: int | None = None, + repl: str | None = None, ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -2228,9 +2219,7 @@ def slice_replace( ), ) - def insert( - self, start: int = 0, repl: Optional[str] = None - ) -> SeriesOrIndex: + def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2410,10 +2399,10 @@ def get_json_object( def split( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2578,10 +2567,10 @@ def split( def rsplit( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -3233,7 +3222,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3292,7 +3281,7 @@ def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) ) - def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3339,7 +3328,7 @@ def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) ) - def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3844,7 +3833,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: return self._return_or_inplace(result_col) - def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: + def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. @@ -3996,7 +3985,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: return self._return_or_inplace(result) def find( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index @@ -4053,7 +4042,7 @@ def find( return self._return_or_inplace(result_col) def rfind( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index @@ -4114,7 +4103,7 @@ def rfind( return self._return_or_inplace(result_col) def index( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring @@ -4176,7 +4165,7 @@ def index( return result def rindex( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring @@ -4443,7 +4432,7 @@ def translate(self, table: dict) -> SeriesOrIndex: ) def filter_characters( - self, table: dict, keep: bool = True, repl: Optional[str] = None + self, table: dict, keep: bool = True, repl: str | None = None ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges @@ -4924,7 +4913,7 @@ def ngrams_tokenize( ) def replace_tokens( - self, targets, replacements, delimiter: Optional[str] = None + self, targets, replacements, delimiter: str | None = None ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series @@ -5009,8 +4998,8 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: Optional[str] = None, - delimiter: Optional[str] = None, + replacement: str | None = None, + delimiter: str | None = None, ) -> SeriesOrIndex: """ Remove tokens from within each string in the series that are @@ -5279,7 +5268,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5322,7 +5311,7 @@ def minhash( ) def minhash64( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5436,8 +5425,8 @@ class StringColumn(column.ColumnBase): respectively """ - _start_offset: Optional[int] - _end_offset: Optional[int] + _start_offset: int | None + _end_offset: int | None _VALID_BINARY_OPERATIONS = { "__eq__", @@ -5461,12 +5450,12 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Optional[Buffer] = None, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + data: Buffer | None = None, + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] = (), ): dtype = cudf.api.types.dtype("object") @@ -5634,8 +5623,8 @@ def to_arrow(self) -> pa.Array: def sum( self, - skipna: Optional[bool] = None, - dtype: Optional[Dtype] = None, + skipna: bool | None = None, + dtype: Dtype | None = None, min_count: int = 0, ): result_col = self._process_for_reduction( @@ -5852,7 +5841,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if not is_scalar(fill_value): @@ -5864,9 +5853,7 @@ def fillna( fill_value = cudf.Scalar(fill_value, dtype=self.dtype) return super().fillna(fill_value, method=method) - def normalize_binop_value( - self, other - ) -> Union[column.ColumnBase, cudf.Scalar]: + def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) and other.dtype == "object" @@ -5930,8 +5917,8 @@ def _binaryop( # Explicit types are necessary because mypy infers ColumnBase # rather than StringColumn and sometimes forgets Scalar. - lhs: Union[cudf.Scalar, StringColumn] - rhs: Union[cudf.Scalar, StringColumn] + lhs: cudf.Scalar | StringColumn + rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) return cast( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 0af847f38af..8eec84b64f7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import numpy as np import pandas as pd @@ -77,10 +77,10 @@ def __init__( self, data: Buffer, dtype: Dtype, - size: Optional[int] = None, # TODO: make non-optional - mask: Optional[Buffer] = None, + size: int | None = None, # TODO: make non-optional + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "m": @@ -255,7 +255,7 @@ def time_unit(self) -> str: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -316,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: unit=self.time_unit, ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: + def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -346,9 +346,9 @@ def quantile( def sum( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only @@ -362,7 +362,7 @@ def sum( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 9f3de061ee8..1bf9a393566 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,16 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Mapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Mapping import numpy as np import pandas as pd @@ -98,13 +89,13 @@ class ColumnAccessor(abc.MutableMapping): column length and type """ - _data: "Dict[Any, ColumnBase]" + _data: "dict[Any, ColumnBase]" multiindex: bool - _level_names: Tuple[Any, ...] + _level_names: tuple[Any, ...] def __init__( self, - data: Union[abc.MutableMapping, ColumnAccessor, None] = None, + data: abc.MutableMapping | ColumnAccessor | None = None, multiindex: bool = False, level_names=None, rangeindex: bool = False, @@ -210,7 +201,7 @@ def _from_columns_like_self( ) @property - def level_names(self) -> Tuple[Any, ...]: + def level_names(self) -> tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: @@ -237,11 +228,11 @@ def nrows(self) -> int: return len(next(iter(self.values()))) @cached_property - def names(self) -> Tuple[Any, ...]: + def names(self) -> tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self) -> Tuple[ColumnBase, ...]: + def columns(self) -> tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property @@ -610,7 +601,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Union[Mapping[Any, Any], Callable], level: Optional[int] + self, mapper: Mapping[Any, Any] | Callable, level: int | None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 80260c7699b..065b13561ab 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -14,20 +14,7 @@ import warnings from collections import abc, defaultdict from collections.abc import Iterator -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast import cupy import numba @@ -684,7 +671,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ _PROTECTED_KEYS = frozenset(("_data", "_index")) - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer _groupby = DataFrameGroupBy @@ -1123,7 +1110,7 @@ def _init_from_dict_like( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, columns: Any = None, ) -> DataFrame: out = super()._from_data(data=data, index=index) @@ -1553,7 +1540,7 @@ def _get_numeric_data(self): return self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): + def assign(self, **kwargs: Callable[[Self], Any] | Any): """ Assign columns to DataFrame from keyword arguments. @@ -2009,12 +1996,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: lhs, rhs = self._data, other @@ -2119,8 +2104,8 @@ def from_dict( cls, data: dict, orient: str = "columns", - dtype: Optional[Dtype] = None, - columns: Optional[list] = None, + dtype: Dtype | None = None, + columns: list | None = None, ) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -4584,7 +4569,7 @@ def apply( def applymap( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -4617,7 +4602,7 @@ def applymap( def map( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -7462,7 +7447,7 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. Return Series with number of distinct elements. Can ignore NaN values. @@ -7490,13 +7475,15 @@ def nunique(self, axis=0, dropna=True): """ if axis != 0: raise NotImplementedError("axis parameter is not supported yet.") - - return cudf.Series(super().nunique(dropna=dropna)) + counts = [col.distinct_count(dropna=dropna) for col in self._columns] + return self._constructor_sliced( + counts, index=self._data.to_pandas_index() + ) def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -7521,11 +7508,11 @@ def _sample_axis_1( def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> DataFrame: result = super()._from_columns_like_self( columns, @@ -8126,7 +8113,7 @@ def _setitem_with_dataframe( input_df: DataFrame, replace_df: DataFrame, input_cols: Any = None, - mask: Optional[ColumnBase] = None, + mask: ColumnBase | None = None, ignore_index: bool = False, ): """ diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 62ded8ac6f1..9cd573aceb9 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,17 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations import enum from collections import abc -from typing import ( - Any, - Dict, - Iterable, - Mapping, - Optional, - Sequence, - Tuple, - cast, -) +from typing import Any, Iterable, Mapping, Sequence, Tuple, cast import cupy as cp import numpy as np @@ -109,7 +101,7 @@ def __dlpack__(self): except ValueError: raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") - def __dlpack_device__(self) -> Tuple[_Device, int]: + def __dlpack_device__(self) -> tuple[_Device, int]: """ _Device type and _Device ID for where the data in the buffer resides. """ @@ -265,7 +257,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: return (kind, bitwidth, format_str, endianness) @property - def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + def describe_categorical(self) -> tuple[bool, bool, dict[int, Any]]: """ If the dtype is categorical, there are two options: @@ -298,7 +290,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: return ordered, is_dictionary, mapping @property - def describe_null(self) -> Tuple[int, Any]: + def describe_null(self) -> tuple[int, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. @@ -338,7 +330,7 @@ def null_count(self) -> int: return self._col.null_count @property - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> dict[str, Any]: """ Store specific metadata of the column. """ @@ -351,7 +343,7 @@ def num_chunks(self) -> int: return 1 def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. @@ -362,7 +354,7 @@ def get_chunks( def get_buffers( self, - ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + ) -> Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None]: """ Return a dictionary containing the underlying buffers. @@ -400,7 +392,7 @@ def get_buffers( def _get_validity_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -433,7 +425,7 @@ def _get_validity_buffer( def _get_offsets_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -461,7 +453,7 @@ def _get_offsets_buffer( def _get_data_buffer( self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + ) -> tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. @@ -588,7 +580,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": ) def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. @@ -745,9 +737,9 @@ def from_dataframe( def _protocol_to_cudf_column_numeric( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert an int, uint, float or bool protocol column @@ -822,9 +814,9 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a categorical column to a Series instance @@ -857,9 +849,9 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a string ColumnObject to cudf Column object. diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index b1282040e60..034849d0e71 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import decimal import operator @@ -6,7 +7,7 @@ import textwrap import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable import numpy as np import pandas as pd @@ -16,12 +17,12 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf._typing import Dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.utils.docutils import doc_apply if TYPE_CHECKING: + from cudf._typing import Dtype from cudf.core.buffer import Buffer @@ -84,11 +85,11 @@ def dtype(arbitrary): def _decode_type( - cls: Type, + cls: type, header: dict, frames: list, - is_valid_class: Callable[[Type, Type], bool] = operator.is_, -) -> Tuple[dict, list, Type]: + is_valid_class: Callable[[type, type], bool] = operator.is_, +) -> tuple[dict, list, type]: """Decode metadata-encoded type and check validity Parameters @@ -481,8 +482,8 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Dtype] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Dtype] = {} header["type-serialized"] = pickle.dumps(type(self)) frames = [] @@ -627,13 +628,13 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Any] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - frames: List[Buffer] = [] + frames: list[Buffer] = [] - fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} + fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {} for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): @@ -823,7 +824,7 @@ def _from_decimal(cls, decimal): precision = max(len(metadata.digits), -metadata.exponent) return cls(precision, -metadata.exponent) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: return ( { "type-serialized": pickle.dumps(type(self)), @@ -946,7 +947,7 @@ def __eq__(self, other): def __hash__(self): return hash((self.subtype, self.closed)) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: header = { "type-serialized": pickle.dumps(type(self)), "fields": pickle.dumps((self.subtype, self.closed)), diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ee310cfcb58..c58a0161ee0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -8,18 +8,7 @@ import pickle import warnings from collections import abc -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. @@ -83,11 +72,11 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: + def _column_names(self) -> tuple[Any, ...]: return self._data.names @property - def _columns(self) -> Tuple[ColumnBase, ...]: + def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property @@ -154,10 +143,10 @@ def _from_data_like_self(self, data: MutableMapping) -> Self: @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ): """Construct a Frame from a list of columns with metadata from self. @@ -172,7 +161,7 @@ def _from_columns_like_self( @_cudf_nvtx_annotate def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: for col in self._data: if col in result._data: @@ -424,15 +413,15 @@ def _to_array( get_array: Callable, module: ModuleType, copy: bool, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, na_value=None, - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. def to_array( col: ColumnBase, dtype: np.dtype - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: if na_value is not None: col = col.fillna(na_value) array = get_array(col) @@ -485,7 +474,7 @@ def to_array( @_cudf_nvtx_annotate def to_cupy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = False, na_value=None, ) -> cupy.ndarray: @@ -519,7 +508,7 @@ def to_cupy( @_cudf_nvtx_annotate def to_numpy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = True, na_value=None, ) -> numpy.ndarray: @@ -552,7 +541,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is False. @@ -628,11 +617,11 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: def fillna( self, value=None, - method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + method: Literal["ffill", "bfill", "pad", "backfill"] | None = None, axis=None, inplace: bool = False, limit=None, - ) -> Optional[Self]: + ) -> Self | None: """Fill null values with ``value`` or specified ``method``. Parameters @@ -1047,7 +1036,7 @@ def _copy_type_metadata( self, other: Self, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -1495,7 +1484,7 @@ def _unaryop(self, op): @_cudf_nvtx_annotate def _colwise_binop( cls, - operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], fn: str, ): """Implement binary ops between two frame-like objects. @@ -1903,16 +1892,15 @@ def nunique(self, dropna: bool = True): dict Name and unique value counts of each column in frame. """ - return { - name: col.distinct_count(dropna=dropna) - for name, col in self._data.items() - } + raise NotImplementedError( + f"{type(self).__name__} does not implement nunique" + ) @staticmethod @_cudf_nvtx_annotate def _repeat( - columns: List[ColumnBase], repeats, axis=None - ) -> List[ColumnBase]: + columns: list[ColumnBase], repeats, axis=None + ) -> list[ColumnBase]: if axis is not None: raise NotImplementedError( "Only axis=`None` supported at this time." diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index aa96051ea51..d08268eea3a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import copy import itertools @@ -7,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Iterable import cupy as cp import numpy as np @@ -20,7 +21,6 @@ from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype -from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 @@ -34,6 +34,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + def _deprecate_collect(): warnings.warn( @@ -1033,11 +1036,11 @@ def ngroup(self, ascending=True): def sample( self, - n: Optional[int] = None, - frac: Optional[float] = None, + n: int | None = None, + frac: float | None = None, replace: bool = False, - weights: Union[abc.Sequence, "cudf.Series", None] = None, - random_state: Union[np.random.RandomState, int, None] = None, + weights: abc.Sequence | "cudf.Series" | None = None, + random_state: np.random.RandomState | int | None = None, ): """Return a random sample of items in each group. @@ -1222,7 +1225,7 @@ def _grouped(self, *, include_groups: bool = True): def _normalize_aggs( self, aggs: MultiColumnAggType - ) -> Tuple[Iterable[Any], Tuple[ColumnBase, ...], List[List[AggType]]]: + ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` is a list of aggregations for column `self.obj[i]`. We support three @@ -1237,7 +1240,7 @@ def _normalize_aggs( Each agg can be string or lambda functions. """ - aggs_per_column: Iterable[Union[AggType, Iterable[AggType]]] + aggs_per_column: Iterable[AggType | Iterable[AggType]] if isinstance(aggs, dict): column_names, aggs_per_column = aggs.keys(), aggs.values() columns = tuple(self.obj._data[col] for col in column_names) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 655f7607b37..13fa187842d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -7,17 +7,7 @@ import warnings from functools import cache, cached_property from numbers import Number -from typing import ( - TYPE_CHECKING, - Any, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast import cupy import numpy as np @@ -101,10 +91,10 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( - idx: Union[Index, cudf.MultiIndex], + idx: Index | cudf.MultiIndex, key_as_table: Frame, is_sorted: bool, -) -> Tuple[int, int, Optional[ColumnBase]]: +) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index is not sorted when called, a sort will take place and `sort_inds` is returned. Otherwise `None` is returned in that position. @@ -898,7 +888,7 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self) -> int: + def nunique(self, dropna: bool = True) -> int: return len(self) @_cudf_nvtx_annotate @@ -2858,7 +2848,7 @@ class IntervalIndex(Index): def __init__( self, data, - closed: Optional[Literal["left", "right", "neither", "both"]] = None, + closed: Literal["left", "right", "neither", "both"] | None = None, dtype=None, copy: bool = False, name=None, @@ -2917,9 +2907,7 @@ def closed(self): def from_breaks( cls, breaks, - closed: Optional[ - Literal["left", "right", "neither", "both"] - ] = "right", + closed: Literal["left", "right", "neither", "both"] | None = "right", name=None, copy: bool = False, dtype=None, @@ -3106,7 +3094,7 @@ def _getdefault_name(values, name): @_cudf_nvtx_annotate -def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: +def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. """ @@ -3147,7 +3135,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: @_cudf_nvtx_annotate -def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: +def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: a*x + b*y = gcd(x, y) @@ -3197,7 +3185,7 @@ def _get_nearest_indexer( index: Index, positions: cudf.Series, target_col: cudf.core.column.ColumnBase, - tolerance: Union[int, float], + tolerance: int | float, ): """ Get the indexer for the nearest index labels; requires an index with diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 3a4f4874e35..06da62306e8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -12,15 +12,9 @@ TYPE_CHECKING, Any, Callable, - Dict, - List, Literal, MutableMapping, - Optional, - Tuple, - Type, TypeVar, - Union, cast, ) from uuid import uuid4 @@ -258,8 +252,8 @@ class IndexedFrame(Frame): """ # mypy can't handle bound type variables as class members - _loc_indexer_type: Type[_LocIndexerClass] # type: ignore - _iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore + _loc_indexer_type: type[_LocIndexerClass] # type: ignore + _iloc_indexer_type: type[_IlocIndexerClass] # type: ignore _index: cudf.core.index.BaseIndex _groupby = GroupBy _resampler = _Resampler @@ -294,14 +288,14 @@ def _num_rows(self) -> int: return len(self.index) @property - def _index_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? return self.index._data.names @classmethod def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, ): out = super()._from_data(data) out._index = RangeIndex(out._data.nrows) if index is None else index @@ -316,11 +310,11 @@ def _from_data_like_self(self, data: MutableMapping): @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """Construct a `Frame` from a list of columns with metadata from self. @@ -368,7 +362,7 @@ def __round__(self, digits=0): def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: self._index = result.index return super()._mimic_inplace(result, inplace) @@ -1788,7 +1782,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): ) @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def mask(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is True. @@ -1924,7 +1918,7 @@ def _copy_type_metadata( other: Self, include_index: bool = True, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -4670,9 +4664,9 @@ def sample( def _sample_axis_0( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, - random_state: Union[np.random.RandomState, cp.random.RandomState], + random_state: np.random.RandomState | cp.random.RandomState, ignore_index: bool, ): try: @@ -4695,7 +4689,7 @@ def _sample_axis_0( def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -4742,12 +4736,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[cudf.BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + cudf.BaseIndex | None, bool, ]: raise NotImplementedError( @@ -6328,8 +6320,8 @@ def _check_duplicate_level_names(specified, level_names): @_cudf_nvtx_annotate def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] -) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: + to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] +) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new values to be replaced with and if all the values are empty. @@ -6354,9 +6346,9 @@ def _get_replacement_values_for_columns( A dict mapping of all columns and the corresponding values to be replaced with. """ - to_replace_columns: Dict[Any, Any] = {} - values_columns: Dict[Any, Any] = {} - all_na_columns: Dict[Any, Any] = {} + to_replace_columns: dict[Any, Any] = {} + values_columns: dict[Any, Any] = {} + all_na_columns: dict[Any, Any] = {} if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} @@ -6496,8 +6488,8 @@ def _is_series(obj): @_cudf_nvtx_annotate def _drop_rows_by_labels( obj: DataFrameOrSeries, - labels: Union[ColumnLike, abc.Iterable, str], - level: Union[int, str], + labels: ColumnLike | abc.Iterable | str, + level: int | str, errors: str, ) -> DataFrameOrSeries: """Remove rows specified by `labels`. diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 7242de9964f..73a1cd26367 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,9 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Tuple, Union +from typing import Any, List, Union from typing_extensions import TypeAlias @@ -59,7 +59,7 @@ class ScalarIndexer: def destructure_iloc_key( - key: Any, frame: Union[cudf.Series, cudf.DataFrame] + key: Any, frame: cudf.Series | cudf.DataFrame ) -> tuple[Any, ...]: """ Destructure a potentially tuple-typed key into row and column indexers. @@ -124,7 +124,7 @@ def destructure_iloc_key( def destructure_dataframe_iloc_indexer( key: Any, frame: cudf.DataFrame -) -> Tuple[Any, Tuple[bool, ColumnLabels]]: +) -> tuple[Any, tuple[bool, ColumnLabels]]: """Destructure an index key for DataFrame iloc getitem. Parameters diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 05cbb4429b9..dd0a4f666a1 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -4,7 +4,7 @@ import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np @@ -51,7 +51,7 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: # Casts lcol and rcol to a common dtype for use as join keys. If no casting # is necessary, they are returned as is. @@ -133,7 +133,7 @@ def _match_join_keys( def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index da999441ca3..ce81c1fc5b1 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,7 @@ from __future__ import annotations import itertools -from typing import Any, ClassVar, List, Optional +from typing import Any, ClassVar import cudf from cudf import _lib as libcudf @@ -370,7 +370,7 @@ def _merge_results( else: multiindex_columns = False - index: Optional[cudf.BaseIndex] + index: cudf.BaseIndex | None if self._using_right_index: # right_index and left_on index = left_result.index @@ -398,7 +398,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # This is taken care of by using a stable sort here, and (in # pandas-compat mode) reordering the gather maps before # producing the input result. - by: List[Any] = [] + by: list[Any] = [] if self._using_left_index and self._using_right_index: by.extend(result.index._data.columns) if not self._using_left_index: diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi index 8587b2dea48..6be73e25332 100644 --- a/python/cudf/cudf/core/mixins/binops.pyi +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -1,12 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Any, Set, Tuple, TypeVar +from typing import Any, TypeVar # Note: It may be possible to define a narrower bound here eventually. BinaryOperandType = TypeVar("BinaryOperandType", bound="Any") class BinaryOperand: - _SUPPORTED_BINARY_OPERATIONS: Set + _SUPPORTED_BINARY_OPERATIONS: set def _binaryop(self, other: BinaryOperandType, op: str): ... def __add__(self, other): ... @@ -36,4 +36,4 @@ class BinaryOperand: def __gt__(self, other): ... def __ge__(self, other): ... @staticmethod - def _check_reflected_op(op) -> Tuple[bool, str]: ... + def _check_reflected_op(op) -> tuple[bool, str]: ... diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index dbaafdb5cd2..1c2126002ad 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Reducible: - _SUPPORTED_REDUCTIONS: Set + _SUPPORTED_REDUCTIONS: set def sum(self): ... def product(self): ... diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi index 37995241b1f..5190750c698 100644 --- a/python/cudf/cudf/core/mixins/scans.pyi +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Scannable: - _SUPPORTED_SCANS: Set + _SUPPORTED_SCANS: set def cumsum(self): ... def cumprod(self): ... diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 865d9660b1d..832cc003d2e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -10,7 +10,7 @@ from collections import abc from functools import cached_property from numbers import Integral -from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union +from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp import numpy as np @@ -40,7 +40,7 @@ from cudf._typing import DataFrameOrSeries -def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: +def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: """Makes best effort to convert an array of indices into a python slice. If the conversion is not possible, return input. `indices` are expected to be valid. @@ -849,9 +849,10 @@ def _index_and_downcast(self, result, index, index_key): def _get_row_major( self, df: DataFrameOrSeries, - row_tuple: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + row_tuple: numbers.Number + | slice + | tuple[Any, ...] + | list[tuple[Any, ...]], ) -> DataFrameOrSeries: if pd.api.types.is_bool_dtype( list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple @@ -874,9 +875,10 @@ def _get_row_major( @_cudf_nvtx_annotate def _validate_indexer( self, - indexer: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + indexer: numbers.Number + | slice + | tuple[Any, ...] + | list[tuple[Any, ...]], ): if isinstance(indexer, numbers.Number): return @@ -1749,6 +1751,11 @@ def fillna(self, value): def unique(self): return self.drop_duplicates(keep="first") + @_cudf_nvtx_annotate + def nunique(self, dropna: bool = True) -> int: + mi = self.dropna(how="all") if dropna else self + return len(mi.unique()) + def _clean_nulls_from_index(self): """ Convert all na values(if any) in MultiIndex object diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 53239cb7ea0..903c4fe7df5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,8 +1,9 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import itertools import warnings -from typing import Dict, Optional +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -10,13 +11,15 @@ import cudf from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype -from cudf._typing import Dtype from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn from cudf.utils.dtypes import min_unsigned_type +if TYPE_CHECKING: + from cudf._typing import Dtype + _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} @@ -1217,10 +1220,10 @@ def _get_unique(column, dummy_na): def _one_hot_encode_column( column: ColumnBase, categories: ColumnBase, - prefix: Optional[str], - prefix_sep: Optional[str], - dtype: Optional[Dtype], -) -> Dict[str, ColumnBase]: + prefix: str | None, + prefix_sep: str | None, + dtype: Dtype | None, +) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with `prefix`, separated with category name with `prefix_sep`. The encoding diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ebf6910ca5f..e532948fd11 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,17 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping import cupy import numpy as np @@ -285,7 +275,7 @@ class _SeriesLocIndexer(_FrameIndexer): """ @_cudf_nvtx_annotate - def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -464,7 +454,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): If ``False``, leaves ``np.nan`` values as is. """ - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _SeriesLocIndexer _iloc_indexer_type = _SeriesIlocIndexer _groupby = SeriesGroupBy @@ -677,7 +667,7 @@ def __init__( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, name: Any = no_default, ) -> Series: out = super()._from_data(data=data, index=index) @@ -1311,7 +1301,7 @@ def map(self, arg, na_action=None) -> "Series": def _getitem_preprocessed( self, spec: indexing_utils.IndexingSpec, - ) -> Union[Self, ScalarLike]: + ) -> Self | ScalarLike: """Get subset of entries given structured data Parameters @@ -1473,12 +1463,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: # Specialize binops to align indices. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 6fd4e857e02..23a2c828a04 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any from typing_extensions import Self @@ -274,10 +274,10 @@ def _make_operands_for_binop( other: Any, fill_value: Any = None, reflect: bool = False, - ) -> Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ]: + ) -> ( + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType + ): """Generate the dictionary of operands used for a binary operation. Parameters @@ -338,11 +338,9 @@ def nunique(self, dropna: bool = True) -> int: int Number of unique values in the column. """ - if self._column.null_count == len(self): - return 0 return self._column.distinct_count(dropna=dropna) - def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: + def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: # A generic method for getting elements from a column that supports a # wide range of different inputs. This method should only used where # _absolutely_ necessary, since in almost all cases a more specific diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 24c49e3662a..9e59b134b73 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -3,7 +3,6 @@ from __future__ import annotations import warnings -from typing import Union import cupy as cp @@ -60,7 +59,7 @@ def __call__( max_num_rows: int, add_special_tokens: bool = True, padding: str = "max_length", - truncation: Union[bool, str] = False, + truncation: bool | str = False, stride: int = 0, return_tensors: str = "cp", return_token_type_ids: bool = False, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f002a838fa9..29130130732 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,9 +1,10 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +from __future__ import annotations import math import re import warnings -from typing import Literal, Optional, Sequence, Union +from typing import Literal, Sequence import cupy as cp import numpy as np @@ -61,7 +62,7 @@ def to_datetime( dayfirst: bool = False, yearfirst: bool = False, utc: bool = False, - format: Optional[str] = None, + format: str | None = None, exact: bool = True, unit: str = "ns", infer_datetime_format: bool = True, @@ -313,7 +314,7 @@ def _process_col( unit: str, dayfirst: bool, infer_datetime_format: bool, - format: Optional[str], + format: str | None, utc: bool, ): if col.dtype.kind == "f": @@ -707,7 +708,7 @@ def _from_freqstr(cls, freqstr: str) -> Self: @classmethod def _from_pandas_ticks_or_weeks( cls, - tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + tick: pd.tseries.offsets.Tick | pd.tseries.offsets.Week, ) -> Self: return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) @@ -725,7 +726,7 @@ def _maybe_as_fast_pandas_offset(self): def _isin_datetimelike( - lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence + lhs: column.TimeDeltaColumn | column.DatetimeColumn, values: Sequence ) -> column.ColumnBase: """ Check whether values are contained in the @@ -784,7 +785,7 @@ def date_range( name=None, closed: Literal["left", "right", "both", "neither"] = "both", *, - unit: Optional[str] = None, + unit: str | None = None, ): """Return a fixed frequency DatetimeIndex. diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 72088493074..dffd7db2f71 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,5 +1,7 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -from typing import Any, Dict +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import Any import numba from numba import cuda, types @@ -124,7 +126,7 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, members) -call_cuda_functions: Dict[Any, Any] = {} +call_cuda_functions: dict[Any, Any] = {} def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bc1f4f2557e..f1704e4ea78 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,8 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools import os -from typing import Any, Callable, Dict +from typing import Any, Callable import cachetools import cupy as cp @@ -57,7 +58,7 @@ MASK_BITSIZE = np.dtype("int32").itemsize * 8 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -launch_arg_getters: Dict[Any, Any] = {} +launch_arg_getters: dict[Any, Any] = {} @functools.cache diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index dbdb2093b72..58b104b84e9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -10,7 +10,7 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable from uuid import uuid4 import numpy as np @@ -679,7 +679,7 @@ def read_parquet( return df -def _normalize_filters(filters: list | None) -> List[List[tuple]] | None: +def _normalize_filters(filters: list | None) -> list[list[tuple]] | None: # Utility to normalize and validate the `filters` # argument to `read_parquet` if not filters: @@ -709,7 +709,7 @@ def _validate_predicate(item): def _apply_post_filters( - df: cudf.DataFrame, filters: List[List[tuple]] | None + df: cudf.DataFrame, filters: list[list[tuple]] | None ) -> cudf.DataFrame: """Apply DNF filters to an in-memory DataFrame @@ -738,7 +738,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: ) return ~column.isna() if negate else column.isna() - handlers: Dict[str, Callable] = { + handlers: dict[str, Callable] = { "==": operator.eq, "!=": operator.ne, "<": operator.lt, @@ -1311,7 +1311,7 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[tempfile.TemporaryDirectory] = ( + self.dir_: tempfile.TemporaryDirectory | None = ( tempfile.TemporaryDirectory() ) self.path = self.dir_.name @@ -1328,12 +1328,12 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: List[ - Tuple[libparquet.ParquetWriter, List[str], str] + self._chunked_writers: list[ + tuple[libparquet.ParquetWriter, list[str], str] ] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup - self.path_cw_map: Dict[str, int] = {} + self.path_cw_map: dict[str, int] = {} self.storage_options = storage_options self.filename = file_name_prefix self.max_file_size = max_file_size @@ -1345,7 +1345,7 @@ def __init__( ) self.max_file_size = _parse_bytes(max_file_size) - self._file_sizes: Dict[str, int] = {} + self._file_sizes: dict[str, int] = {} @_cudf_nvtx_annotate def write_table(self, df): diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index efa8eabd8b8..fb5a963f008 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,11 +1,14 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. +from __future__ import annotations import os import textwrap -from collections.abc import Container from contextlib import ContextDecorator from dataclasses import dataclass -from typing import Any, Callable, Dict, Optional +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from collections.abc import Container @dataclass @@ -16,7 +19,7 @@ class Option: validator: Callable -_OPTIONS: Dict[str, Option] = {} +_OPTIONS: dict[str, Option] = {} def _env_get_int(name, default): @@ -123,7 +126,7 @@ def _build_option_description(name, opt): ) -def describe_option(name: Optional[str] = None): +def describe_option(name: str | None = None): """Prints the description of an option. If `name` is unspecified, prints the description of all available options. diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 128913e5746..1540c6850e7 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -12,17 +12,7 @@ import warnings from collections.abc import Iterator from enum import IntEnum -from typing import ( - Any, - Callable, - Dict, - Literal, - Mapping, - Optional, - Set, - Tuple, - Type, -) +from typing import Any, Callable, Literal, Mapping import numpy as np @@ -118,12 +108,12 @@ def make_final_proxy_type( *, fast_to_slow: Callable, slow_to_fast: Callable, - module: Optional[str] = None, + module: str | None = None, additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, - bases: Tuple = (), - metaclasses: Tuple = (), -) -> Type[_FinalProxy]: + bases: tuple = (), + metaclasses: tuple = (), +) -> type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow types. Final types are types for which known operations exist for @@ -270,8 +260,8 @@ def make_intermediate_proxy_type( fast_type: type, slow_type: type, *, - module: Optional[str] = None, -) -> Type[_IntermediateProxy]: + module: str | None = None, +) -> type[_IntermediateProxy]: """ Defines a proxy type for a pair of "intermediate" fast and slow types. Intermediate types are the types of the results of @@ -613,13 +603,13 @@ class _IntermediateProxy(_FastSlowProxy): `make_intermediate_proxy_type` to create subtypes. """ - _method_chain: Tuple[Callable, Tuple, Dict] + _method_chain: tuple[Callable, tuple, dict] @classmethod def _fsproxy_wrap( cls, obj: Any, - method_chain: Tuple[Callable, Tuple, Dict], + method_chain: tuple[Callable, tuple, dict], ): """ Parameters @@ -955,7 +945,7 @@ def _fast_slow_function_call( def _transform_arg( arg: Any, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Any: """ Transform "arg" into its corresponding slow (or fast) type. @@ -1052,7 +1042,7 @@ def _fast_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding fast type. """ - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_fast", seen) @@ -1060,7 +1050,7 @@ def _slow_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding slow type. """ - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_slow", seen) @@ -1137,7 +1127,7 @@ def _is_function_or_method(obj: Any) -> bool: def _replace_closurevars( f: types.FunctionType, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Callable[..., Any]: """ Return a copy of `f` with its closure variables replaced with @@ -1199,10 +1189,10 @@ def is_proxy_object(obj: Any) -> bool: return False -NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) +NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) -_SPECIAL_METHODS: Set[str] = { +_SPECIAL_METHODS: set[str] = { "__abs__", "__add__", "__and__", diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 1d431c6d882..f82e300e83d 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, Dict, NamedTuple, Tuple +from typing import Any, ContextManager, NamedTuple from typing_extensions import Self @@ -377,7 +377,7 @@ class ModuleAccelerator(ModuleAcceleratorBase): attempts to call the fast version first). """ - _denylist: Tuple[str] + _denylist: tuple[str] _use_fast_lib: bool _use_fast_lib_lock: threading.RLock _module_cache_prefix: str = "_slow_lib_" @@ -519,7 +519,7 @@ def disabled(self): def getattr_real_or_wrapped( name: str, *, - real: Dict[str, Any], + real: dict[str, Any], wrapped_objs, loader: ModuleAccelerator, ) -> Any: diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 0dbd333ce4f..0fb41fc0b26 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect import operator @@ -8,7 +9,6 @@ import sys import time from collections import defaultdict -from typing import Union from rich.console import Console from rich.syntax import Syntax @@ -119,12 +119,10 @@ def __exit__(self, *args, **kwargs): @staticmethod def get_namespaced_function_name( - func_obj: Union[ - _FunctionProxy, - _MethodProxy, - type[_FinalProxy], - type[_IntermediateProxy], - ], + func_obj: _FunctionProxy + | _MethodProxy + | type[_FinalProxy] + | type[_IntermediateProxy], ): if isinstance(func_obj, _MethodProxy): return func_obj._fsproxy_slow.__qualname__ diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 54d38f1a8cf..bf927e661fe 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -from typing import Optional, Union +from __future__ import annotations import pyarrow as pa import pytest @@ -10,7 +9,7 @@ def metadata_from_arrow_array( pa_array: pa.Array, -) -> Optional[plc.interop.ColumnMetadata]: +) -> plc.interop.ColumnMetadata | None: metadata = None if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): metadata = plc.interop.ColumnMetadata( @@ -25,7 +24,7 @@ def metadata_from_arrow_array( def assert_column_eq( - lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column] + lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column ) -> None: """Verify that a pylibcudf array and PyArrow array are equal.""" # Nested types require children metadata to be passed to the conversion function. diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 0e38b10ed52..238e8d990cc 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import datetime import io import pathlib -from typing import Optional import fastavro import numpy as np @@ -292,7 +293,7 @@ def test_can_detect_dtypes_from_avro_logical_type( assert_eq(expected, actual) -def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]: +def get_days_from_epoch(date: datetime.date | None) -> int | None: if date is None: return None return (date - datetime.date(1970, 1, 1)).days diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 98e9f9881c7..649821b9b7c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data): assert_eq(expected, actual) +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(2, name="foo"), + pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), + pd.Index([3, 5], dtype=np.int8, name="foo"), + ], +) +def test_nunique_preserve_column_in_index(columns): + df = cudf.DataFrame([[1, 2]], columns=columns) + result = df.nunique().index.to_pandas() + assert_eq(result, columns, exact=True) + + @pytest.mark.parametrize( "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index a22b678ebe6..8ce4da792a4 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations -from typing import Any, Tuple +from typing import Any import cupy as cp import pandas as pd @@ -64,7 +65,7 @@ def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): raise NotImplementedError() -def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): +def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f143112a45f..7b95e4f9a44 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable(): lfunc_args_and_kwargs=((),), rfunc_args_and_kwargs=((),), ) + + +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique(array, dropna): + arrays = [array, [3, 4]] + gidx = cudf.MultiIndex.from_arrays(arrays) + pidx = pd.MultiIndex.from_arrays(arrays) + result = gidx.nunique(dropna=dropna) + expected = pidx.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 30189e1ac8a..52956c230ba 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2851,3 +2851,13 @@ def test_nans_to_nulls_noop_copies_column(value): ser1 = cudf.Series([value]) ser2 = ser1.nans_to_nulls() assert ser1._column is not ser2._column + + +@pytest.mark.parametrize("dropna", [False, True]) +def test_nunique_all_null(dropna): + data = [None, None] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + result = pd_ser.nunique(dropna=dropna) + expected = cudf_ser.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 913a958b4c2..59b8e6d2e70 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,5 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. +from __future__ import annotations import contextlib import importlib @@ -7,7 +8,6 @@ import warnings import weakref from concurrent.futures import ThreadPoolExecutor -from typing import List, Tuple import cupy import numpy as np @@ -107,7 +107,7 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: gen_df_data_nbytes = single_column_df()._data._data["a"].data.nbytes -def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]: +def spilled_and_unspilled(manager: SpillManager) -> tuple[int, int]: """Get bytes spilled and unspilled known by the manager""" spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) unspilled = sum( @@ -661,7 +661,7 @@ def test_statistics(manager: SpillManager): def test_statistics_expose(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 - buffers: List[SpillableBuffer] = [ + buffers: list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] @@ -687,7 +687,7 @@ def test_statistics_expose(manager: SpillManager): assert stat.spilled_nbytes == 0 # Create and spill 10 new buffers - buffers: List[SpillableBuffer] = [ + buffers: list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index d57303ca122..cd7fe5ee023 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,7 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools -from typing import Any, Dict +from typing import Any import cupy as cp from numba import cuda @@ -339,7 +340,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache: Dict[Any, Any] = dict() +_cache: dict[Any, Any] = dict() @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 239438afd24..78aeac425f7 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,8 +1,9 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import ast import datetime -from typing import Any, Dict +from typing import Any import numpy as np from numba import cuda @@ -114,7 +115,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache: Dict[Any, Any] = {} +_cache: dict[Any, Any] = {} def query_compile(expr): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 95621cf9519..2e4dfc4bb14 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import decimal import functools import os import traceback import warnings -from typing import FrozenSet, Set, Union import numpy as np import pandas as pd @@ -218,7 +218,7 @@ class GetAttrGetItemMixin: # `__setstate__`, but this class may be used in complex multiple # inheritance hierarchies that might also override serialization. The # solution here is a minimally invasive change that avoids such conflicts. - _PROTECTED_KEYS: Union[FrozenSet[str], Set[str]] = frozenset() + _PROTECTED_KEYS: frozenset[str] | set[str] = frozenset() def __getattr__(self, key): if key in self._PROTECTED_KEYS: diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index 39bf07c49de..a75a20a4681 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect from functools import partial diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 03c1db68dbd..0605bba6642 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -688,13 +688,12 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - else: - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + assert isinstance(arg, Literal) + prog = plc.strings.regex_program.RegexProgram.create( + arg.value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -725,26 +724,9 @@ def do_evaluate( else prefix.obj, ) ) - else: - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) - elif self.name == pl_expr.StringFunction.StartsWith: - column, suffix = columns - return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising class Sort(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9fb2468e4e9..7f0920e1b57 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -286,13 +286,18 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - # TODO: goes away when libcudf supports large strings table = pdf.to_arrow() schema = table.schema for i, field in enumerate(schema): + # TODO: Nested types if field.type == pa.large_string(): - # TODO: Nested types + # TODO: goes away when libcudf supports large strings schema = schema.set(i, pa.field(field.name, pa.string())) + elif isinstance(field.type, pa.LargeListType): + # TODO: goes away when libcudf supports large lists + schema = schema.set( + i, pa.field(field.name, pa.list_(field.type.field(0))) + ) table = table.cast(schema) df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) @@ -850,9 +855,11 @@ class MapFunction(IR): _NAMES: ClassVar[frozenset[str]] = frozenset( [ - "drop_nulls", "rechunk", - "merge_sorted", + # libcudf merge is not stable wrt order of inputs, since + # it uses a priority queue to manage the tables it produces. + # See: https://github.com/rapidsai/cudf/issues/16010 + # "merge_sorted", "rename", "explode", ] @@ -869,46 +876,13 @@ def __post_init__(self) -> None: # polars requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") - elif self.name == "merge_sorted": - assert isinstance(self.df, Union) - (key_column,) = self.options - if key_column not in self.df.dfs[0].schema: - raise ValueError(f"Key column {key_column} not found") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - if self.name == "merge_sorted": - # merge_sorted operates on Union inputs - # but if we evaluate the Union then we can't unpick the - # pieces, so we dive inside and evaluate the pieces by hand - assert isinstance(self.df, Union) - first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs) - (key_column,) = self.options - if not all(first.column_names == r.column_names for r in rest): - raise ValueError("DataFrame shapes/column names don't match") - # Already validated that key_column is in column names - index = first.column_names.index(key_column) - return DataFrame.from_table( - plc.merge.merge_sorted( - [first.table, *(df.table for df in rest)], - [index], - [plc.types.Order.ASCENDING], - [plc.types.NullOrder.BEFORE], - ), - first.column_names, - ).sorted_like(first, subset={key_column}) - elif self.name == "rechunk": + if self.name == "rechunk": # No-op in our data model - return self.df.evaluate(cache=cache) - elif self.name == "drop_nulls": - df = self.df.evaluate(cache=cache) - (subset,) = self.options - subset = set(subset) - indices = [i for i, name in enumerate(df.column_names) if name in subset] - return DataFrame.from_table( - plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), - df.column_names, - ).sorted_like(df) + # Don't think this appears in a plan tree from python + return self.df.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": df = self.df.evaluate(cache=cache) # final tag is "swapping" which is useful for the @@ -924,7 +898,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) else: - raise AssertionError("Should never be reached") + raise AssertionError("Should never be reached") # pragma: no cover @dataclasses.dataclass(slots=True) diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py new file mode 100644 index 00000000000..9bbce6bc080 --- /dev/null +++ b/python/cudf_polars/tests/conftest.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") +def with_nulls(request): + return request.param diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 79018c80bf3..b044bbb2885 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,11 +20,6 @@ def dtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture( params=[ False, diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py index 22865a7ce22..143dd7e9f0f 100644 --- a/python/cudf_polars/tests/expressions/test_distinct.py +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -9,11 +9,6 @@ from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) -def nullable(request): - return request.param - - @pytest.fixture( params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] ) @@ -22,9 +17,9 @@ def op(request): @pytest.fixture -def df(nullable): +def df(with_nulls): values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] - if nullable: + if with_nulls: values[1] = None values[4] = None return pl.LazyFrame({"a": values}) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 548aebf0875..7eefc59d927 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -29,11 +29,6 @@ def rtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture( params=[ pl.Expr.eq, diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 198f35d376b..3c498fe7286 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -2,22 +2,39 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from functools import partial + import pytest import polars as pl -from cudf_polars import translate_ir +from cudf_polars import execute_with_cudf, translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal -def test_supported_stringfunction_expression(): - ldf = pl.LazyFrame( - { - "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 - "b": [0, 3, 1, -1, None], - } - ) +@pytest.fixture +def ldf(with_nulls): + a = [ + "AbC", + "de", + "FGHI", + "j", + "kLm", + "nOPq", + "", + "RsT", + "sada", + "uVw", + "h", + "Wıth ünιcοde", # noqa: RUF001 + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a, "b": range(len(a))}) + +def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), pl.col("a").str.ends_with("h").alias("endswith_h"), @@ -27,15 +44,63 @@ def test_supported_stringfunction_expression(): assert_gpu_result_equal(query) -def test_unsupported_stringfunction(): - ldf = pl.LazyFrame( - { - "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"], # noqa: RUF001 - "b": [0, 3, 1, -1, None], - } - ) - +def test_unsupported_stringfunction(ldf): q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) with pytest.raises(NotImplementedError): _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_strict_raises(ldf): + q = ldf.select(pl.col("a").str.contains(".", strict=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_literal_raises(ldf): + q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize( + "substr", + [ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], +) +def test_contains_regex(ldf, substr): + query = ldf.select(pl.col("a").str.contains(substr)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] +) +def test_contains_literal(ldf, literal): + query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_column(ldf): + query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_invalid(ldf): + query = ldf.select(pl.col("a").str.contains("[")) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect() + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py new file mode 100644 index 00000000000..ec6b3f3fc0a --- /dev/null +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_merge_sorted_raises(): + df1 = pl.LazyFrame({"a": [1, 6, 9], "b": [1, -10, 4]}) + df2 = pl.LazyFrame({"a": [-1, 5, 11, 20], "b": [2, 7, -4, None]}) + df3 = pl.LazyFrame({"a": [-10, 20, 21], "b": [1, 2, 3]}) + + q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_explode_multiple_raises(): + df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) + q = df.explode("a", "b") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize("column", ["a", "b"]) +def test_explode_single(column): + df = pl.LazyFrame( + { + "a": [[1, 2], [3, 4], None], + "b": [[5, 6], [7, 8], [9, 10]], + "c": [None, 11, 12], + } + ) + q = df.explode(column) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py deleted file mode 100644 index f1a080d040f..00000000000 --- a/python/cudf_polars/tests/test_string.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from functools import partial - -import pytest - -import polars as pl - -from cudf_polars.callback import execute_with_cudf -from cudf_polars.testing.asserts import assert_gpu_result_equal - - -@pytest.fixture -def ldf(): - return pl.DataFrame( - {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]} - ).lazy() - - -@pytest.mark.parametrize( - "substr", - [ - "A", - "de", - ".*", - "^a", - "^A", - "[^a-z]", - "[a-z]{3,}", - "^[A-Z]{2,}", - "j|u", - ], -) -def test_contains_regex(ldf, substr): - query = ldf.select(pl.col("a").str.contains(substr)) - assert_gpu_result_equal(query) - - -@pytest.mark.parametrize( - "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] -) -def test_contains_literal(ldf, literal): - query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) - assert_gpu_result_equal(query) - - -def test_contains_column(ldf): - query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) - assert_gpu_result_equal(query) - - -@pytest.mark.parametrize("pat", ["["]) -def test_contains_invalid(ldf, pat): - query = ldf.select(pl.col("a").str.contains(pat)) - - with pytest.raises(pl.exceptions.ComputeError): - query.collect() - with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index ef47ea436c7..2e72461b43d 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations from functools import wraps -from typing import Set import numpy as np import pandas as pd @@ -695,7 +695,7 @@ def _aggs_optimized(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): - _global_set: Set[str] = set() + _global_set: set[str] = set() for col in arg: if isinstance(arg[col], list): _global_set = _global_set.union(set(arg[col])) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba8b1e89721..810a804e428 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -316,7 +316,7 @@ def read_partition( if index and (index[0] in df.columns): df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): + elif index is False and df.index.names != [None]: # If index=False, we shouldn't have a named index df.reset_index(inplace=True)