Merge branch 'branch-24.08' of github.com:rapidsai/cudf into branch-24.08
Matt711 · Jun 17, 2024 · 64931fc · 64931fc
2 parents 68f9cae + 87f6a7e
commit 64931fc
Show file tree

Hide file tree

Showing 73 changed files with 733 additions and 787 deletions.
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -819,7 +820,17 @@ struct operator_functor<ast_operator::NOT, false> {
 template <typename To>
 struct cast {
   static constexpr auto arity{1};
-  template <typename From>
+  template <typename From, typename std::enable_if_t<is_fixed_point<From>()>* = nullptr>
+  __device__ inline auto operator()(From f) -> To
+  {
+    if constexpr (cuda::std::is_floating_point_v<To>) {
+      return convert_fixed_to_floating<To>(f);
+    } else {
+      return static_cast<To>(f);
+    }
+  }
+
+  template <typename From, typename cuda::std::enable_if_t<!is_fixed_point<From>()>* = nullptr>
   __device__ inline auto operator()(From f) -> decltype(static_cast<To>(f))
   {
     return static_cast<To>(f);

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
@@ -187,7 +187,11 @@ cudf::size_type distinct_count(column_view const& input,
                                nan_policy nan_handling,
                                rmm::cuda_stream_view stream)
 {
-  if (0 == input.size() or input.null_count() == input.size()) { return 0; }
+  if (0 == input.size()) { return 0; }
+
+  if (input.null_count() == input.size()) {
+    return static_cast<size_type>(null_handling == null_policy::INCLUDE);
+  }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ quiet-level = 3
 line-length = 79
 
 [tool.ruff.lint]
-select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"]
+select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"]
 ignore = [
     # whitespace before :
     "E203",

diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
@@ -2,36 +2,34 @@
 
 from __future__ import annotations
 
-from typing import Dict, Optional, Tuple
-
 from typing_extensions import Self
 
 from cudf._typing import Dtype, DtypeObj, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
 
 class Column:
-    _data: Optional[Buffer]
-    _mask: Optional[Buffer]
-    _base_data: Optional[Buffer]
-    _base_mask: Optional[Buffer]
+    _data: Buffer | None
+    _mask: Buffer | None
+    _base_data: Buffer | None
+    _base_mask: Buffer | None
     _dtype: DtypeObj
     _size: int
     _offset: int
     _null_count: int
-    _children: Tuple[ColumnBase, ...]
-    _base_children: Tuple[ColumnBase, ...]
-    _distinct_count: Dict[bool, int]
+    _children: tuple[ColumnBase, ...]
+    _base_children: tuple[ColumnBase, ...]
+    _distinct_count: dict[bool, int]
 
     def __init__(
         self,
-        data: Optional[Buffer],
+        data: Buffer | None,
         size: int,
         dtype: Dtype,
-        mask: Optional[Buffer] = None,
-        offset: Optional[int] = None,
-        null_count: Optional[int] = None,
-        children: Tuple[ColumnBase, ...] = (),
+        mask: Buffer | None = None,
+        offset: int | None = None,
+        null_count: int | None = None,
+        children: tuple[ColumnBase, ...] = (),
     ) -> None: ...
     @property
     def base_size(self) -> int: ...
@@ -40,35 +38,35 @@ class Column:
     @property
     def size(self) -> int: ...
     @property
-    def base_data(self) -> Optional[Buffer]: ...
+    def base_data(self) -> Buffer | None: ...
     @property
-    def data(self) -> Optional[Buffer]: ...
+    def data(self) -> Buffer | None: ...
     @property
     def data_ptr(self) -> int: ...
     def set_base_data(self, value: Buffer) -> None: ...
     @property
     def nullable(self) -> bool: ...
     def has_nulls(self, include_nan: bool = False) -> bool: ...
     @property
-    def base_mask(self) -> Optional[Buffer]: ...
+    def base_mask(self) -> Buffer | None: ...
     @property
-    def mask(self) -> Optional[Buffer]: ...
+    def mask(self) -> Buffer | None: ...
     @property
     def mask_ptr(self) -> int: ...
-    def set_base_mask(self, value: Optional[Buffer]) -> None: ...
-    def set_mask(self, value: Optional[Buffer]) -> Self: ...
+    def set_base_mask(self, value: Buffer | None) -> None: ...
+    def set_mask(self, value: Buffer | None) -> Self: ...
     @property
     def null_count(self) -> int: ...
     @property
     def offset(self) -> int: ...
     @property
-    def base_children(self) -> Tuple[ColumnBase, ...]: ...
+    def base_children(self) -> tuple[ColumnBase, ...]: ...
     @property
-    def children(self) -> Tuple[ColumnBase, ...]: ...
-    def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ...
+    def children(self) -> tuple[ColumnBase, ...]: ...
+    def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ...
     def _mimic_inplace(
         self, other_col: ColumnBase, inplace=False
-    ) -> Optional[Self]: ...
+    ) -> Self | None: ...
 
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod

diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -8,7 +8,7 @@
 from collections import abc
 from functools import wraps
 from inspect import isclass
-from typing import List, Union, cast
+from typing import cast
 
 import cupy as cp
 import numpy as np
@@ -219,7 +219,7 @@ def wrapped_func(obj):
 
 
 def _union_categoricals(
-    to_union: List[Union[cudf.Series, cudf.CategoricalIndex]],
+    to_union: list[cudf.Series | cudf.CategoricalIndex],
     sort_categories: bool = False,
     ignore_order: bool = False,
 ):

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -5,7 +5,7 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Literal, Set, Tuple
+from typing import TYPE_CHECKING, Any, Literal
 
 import pandas as pd
 from typing_extensions import Self
@@ -44,11 +44,11 @@
 class BaseIndex(Serializable):
     """Base class for all cudf Index types."""
 
-    _accessors: Set[Any] = set()
+    _accessors: set[Any] = set()
     _data: ColumnAccessor
 
     @property
-    def _columns(self) -> Tuple[Any, ...]:
+    def _columns(self) -> tuple[Any, ...]:
         raise NotImplementedError
 
     @cached_property
@@ -342,9 +342,9 @@ def deserialize(cls, header, frames):
     @property
     def names(self):
         """
-        Returns a tuple containing the name of the Index.
+        Returns a FrozenList containing the name of the Index.
         """
-        return (self.name,)
+        return pd.core.indexes.frozen.FrozenList([self.name])
 
     @names.setter
     def names(self, values):

diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import ast
 import functools
-from typing import List, Tuple
 
 from cudf._lib.expressions import (
     ASTOperator,
@@ -98,9 +98,9 @@ class libcudfASTVisitor(ast.NodeVisitor):
         The column names used to map the names in an expression.
     """
 
-    def __init__(self, col_names: Tuple[str]):
-        self.stack: List[Expression] = []
-        self.nodes: List[Expression] = []
+    def __init__(self, col_names: tuple[str]):
+        self.stack: list[Expression] = []
+        self.nodes: list[Expression] = []
         self.col_names = col_names
 
     @property
@@ -218,7 +218,7 @@ def visit_Call(self, node):
 
 
 @functools.lru_cache(256)
-def parse_expression(expr: str, col_names: Tuple[str]):
+def parse_expression(expr: str, col_names: tuple[str]):
     visitor = libcudfASTVisitor(col_names)
     visitor.visit(ast.parse(expr))
     return visitor
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
@@ -1,20 +1,23 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import os
 import zoneinfo
 from functools import lru_cache
-from typing import Literal, Tuple
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
 from cudf._lib.timezone import make_timezone_transition_table
 from cudf.core.column.column import as_column
-from cudf.core.column.datetime import DatetimeColumn
-from cudf.core.column.timedelta import TimeDeltaColumn
+
+if TYPE_CHECKING:
+    from cudf.core.column.datetime import DatetimeColumn
+    from cudf.core.column.timedelta import TimeDeltaColumn
 
 
 @lru_cache(maxsize=20)
-def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     """
     Return timezone data (transition times and UTC offsets) for the
     given IANA time zone.
@@ -40,7 +43,7 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
 
 def _find_and_read_tzfile_tzpath(
     zone_name: str,
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     for search_path in zoneinfo.TZPATH:
         if os.path.isfile(os.path.join(search_path, zone_name)):
             return _read_tzfile_as_columns(search_path, zone_name)
@@ -49,7 +52,7 @@ def _find_and_read_tzfile_tzpath(
 
 def _find_and_read_tzfile_tzdata(
     zone_name: str,
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     import importlib.resources
 
     package_base = "tzdata.zoneinfo"
@@ -78,7 +81,7 @@ def _find_and_read_tzfile_tzdata(
 
 def _read_tzfile_as_columns(
     tzdir, zone_name: str
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     transition_times_and_offsets = make_timezone_transition_table(
         tzdir, zone_name
     )
@@ -92,7 +95,7 @@ def _read_tzfile_as_columns(
 
 def check_ambiguous_and_nonexistent(
     ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
-) -> Tuple[Literal["NaT"], Literal["NaT"]]:
+) -> tuple[Literal["NaT"], Literal["NaT"]]:
     if ambiguous != "NaT":
         raise NotImplementedError(
             "Only ambiguous='NaT' is currently supported"

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
@@ -1,18 +1,17 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import warnings
-from typing import Tuple, Union
+from typing import TYPE_CHECKING
 
 import numpy as np
 
 import cudf
-from cudf._typing import ScalarLike
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     is_bool_dtype,
     is_scalar,
 )
-from cudf.core.column import ColumnBase
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
     _can_cast,
@@ -21,6 +20,10 @@
     is_mixed_with_object_dtype,
 )
 
+if TYPE_CHECKING:
+    from cudf._typing import ScalarLike
+    from cudf.core.column import ColumnBase
+
 
 def _normalize_categorical(input_col, other):
     if isinstance(input_col, cudf.core.column.CategoricalColumn):
@@ -41,9 +44,9 @@ def _normalize_categorical(input_col, other):
 
 def _check_and_cast_columns_with_other(
     source_col: ColumnBase,
-    other: Union[ScalarLike, ColumnBase],
+    other: ScalarLike | ColumnBase,
     inplace: bool,
-) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]:
+) -> tuple[ColumnBase, ScalarLike | ColumnBase]:
     # Returns type-casted `source_col` & `other` based on `inplace`.
     source_dtype = source_col.dtype
     if isinstance(source_dtype, CategoricalDtype):