From ad1369d2d6eabf4b0ae480a10463a74f3034aece Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 5 Sep 2024 01:11:07 +0200 Subject: [PATCH] CI: Test against old versions of key dependencies (#16570) This adds explicit tests with old versions of key dependencies. Specifically: - `numba==0.57` - `numpy==1.23` - `pandas==2.0` - ~`fsspec==0.6.0`~ excluded it. `transformers==4.39.3` requires `huggingface_hub` which requires `fsspec>=2023.5.0`. In principle one could include it e.g. only for conda which doesn't pull in `transformers`, but that seemed not worth the trouble? - `cupy==12.0.0` - `pyarrow==16.1.0` See also https://github.com/rapidsai/build-planning/issues/81 (Marking as draft until I see that things work.) Authors: - Sebastian Berg (https://github.com/seberg) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16570 --- ci/cudf_pandas_scripts/run_tests.sh | 13 +- ci/test_python_common.sh | 3 +- ci/test_wheel_cudf.sh | 14 ++ ci/test_wheel_cudf_polars.sh | 11 ++ ci/test_wheel_dask_cudf.sh | 13 ++ dependencies.yaml | 22 +++ .../cudf/cudf/tests/indexes/test_interval.py | 4 + .../test_avro_reader_fastavro_integration.py | 5 + python/cudf/cudf/tests/test_binops.py | 41 +++++- python/cudf/cudf/tests/test_categorical.py | 5 + python/cudf/cudf/tests/test_concat.py | 99 ++++++++----- python/cudf/cudf/tests/test_csv.py | 12 +- python/cudf/cudf/tests/test_dataframe.py | 19 ++- python/cudf/cudf/tests/test_datetime.py | 35 ++++- python/cudf/cudf/tests/test_doctests.py | 5 + python/cudf/cudf/tests/test_groupby.py | 112 +++++++++++++++ python/cudf/cudf/tests/test_index.py | 37 ++++- python/cudf/cudf/tests/test_indexing.py | 8 ++ python/cudf/cudf/tests/test_interpolate.py 
| 4 + python/cudf/cudf/tests/test_interval.py | 5 + python/cudf/cudf/tests/test_join_order.py | 130 +++++++++++++++++- python/cudf/cudf/tests/test_mvc.py | 8 +- python/cudf/cudf/tests/test_numerical.py | 3 +- python/cudf/cudf/tests/test_orc.py | 8 +- python/cudf/cudf/tests/test_parquet.py | 5 + python/cudf/cudf/tests/test_reductions.py | 5 + python/cudf/cudf/tests/test_replace.py | 20 ++- python/cudf/cudf/tests/test_resampling.py | 9 ++ python/cudf/cudf/tests/test_reshape.py | 17 ++- python/cudf/cudf/tests/test_stats.py | 8 ++ .../cudf_pandas_tests/test_cudf_pandas.py | 12 +- .../dask_cudf/tests/test_applymap.py | 6 + .../dask_cudf/tests/test_distributed.py | 5 + .../dask_cudf/dask_cudf/tests/test_groupby.py | 5 + 34 files changed, 638 insertions(+), 70 deletions(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 8b85695c861..1c2724a9a5d 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -54,8 +54,19 @@ else RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist - # echo to expand wildcard before adding `[extra]` requires for pip + echo "" > ./constraints.txt + if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + rapids-dependency-file-generator \ + --output requirements \ + --file-key test_python \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt + fi + python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh 
index e8849588aa5..d0675b0431a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -14,7 +14,8 @@ ENV_YAML_DIR="$(mktemp -d)" rapids-dependency-file-generator \ --output conda \ --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml" + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee "${ENV_YAML_DIR}/env.yaml" rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 6861d699695..28ded2f8e0f 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -10,8 +10,22 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install cudf, pylibcudf, and test requirements" + +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 0baf6c9e277..9844090258a 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -25,9 +25,20 @@ 
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_cudf_polars \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + -v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index fa74b2398f7..0d39807d56c 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,8 +11,21 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" +# Constrain to minimum dependency versions if job is set up as "oldest" +echo "" > ./constraints.txt +if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then + rapids-dependency-file-generator \ + --output requirements \ + --file-key py_test_dask_cudf \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + | tee ./constraints.txt +fi + # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install \ + 
-v \ + --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ diff --git a/dependencies.yaml b/dependencies.yaml index c6851d9cb90..f8b231efd6d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -696,6 +696,28 @@ dependencies: - pytest<8 - pytest-cov - pytest-xdist + specific: + # Define additional constraints for testing with oldest dependencies. + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numba==0.57.* + - numpy==1.23.* + - pandas==2.0.* + - pyarrow==14.0.0 + - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: requirements + # Using --constraints for pip install, so we list cupy multiple times + matrices: + - matrix: {dependencies: "oldest"} + packages: + - cupy-cuda11x==12.0.0 + - cupy-cuda12x==12.0.0 + - matrix: + packages: test_python_pylibcudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 6653a94c9be..25edf788daf 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -149,6 +149,10 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t): assert_eq(pindex, gindex) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_interval_range_periods_warnings(): start_val, end_val, periods_val = 0, 4, 1.0 diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 2ec1d1d2f28..9d69e626c3d 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -23,6 
+23,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -302,6 +303,10 @@ def get_days_from_epoch(date: datetime.date | None) -> int | None: @pytest.mark.parametrize("namespace", [None, "root_ns"]) @pytest.mark.parametrize("nullable", [True, False]) @pytest.mark.parametrize("prepend_null", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas (datetime(9999, ...) too large)", +) def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): avro_type = {"logicalType": "date", "type": "int"} if nullable: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 4256ec872e6..2e8519509e2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,11 @@ import cudf from cudf import Index, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import _utils as utils, assert_eq from cudf.utils.dtypes import ( @@ -1781,6 +1785,20 @@ def test_datetime_dateoffset_binaryop( reason="https://github.com/pandas-dev/pandas/issues/57448", ) ) + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") date_col = [ f"2000-01-01 00:00:{components}", @@ -1834,7 +1852,11 @@ def 
test_datetime_dateoffset_binaryop( "ignore:Discarding nonzero nanoseconds:UserWarning" ) @pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): gsr = cudf.Series(date_col, dtype="datetime64[ns]") psr = gsr.to_pandas() @@ -1873,6 +1895,21 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): def test_datetime_dateoffset_binaryop_reflected( n_periods, frequency, dtype, components ): + if ( + not PANDAS_GE_220 + and dtype in {"datetime64[ms]", "datetime64[s]"} + and frequency in ("microseconds", "nanoseconds") + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + if ( + not PANDAS_GE_220 + and dtype == "datetime64[us]" + and frequency == "nanoseconds" + and n_periods != 0 + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") + date_col = [ f"2000-01-01 00:00:{components}", f"2000-01-31 00:00:{components}", diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index ae58af8ebce..cd1ad21ae59 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,6 +11,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal @@ -858,6 +859,10 @@ def test_cat_from_scalar(scalar): assert_eq(ps, gs) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_cat_groupby_fillna(): ps = pd.Series(["a", "b", "c"], dtype="category") gs = cudf.from_pandas(ps) diff --git 
a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index c1c03de48d4..8da589ba45b 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,6 +9,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -451,45 +452,75 @@ def test_concat_mixed_input(): [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], - [ - pd.Series([1, 2, 3.0, 1.2], name="abc"), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), - pd.DataFrame({"a": [1, 2]}), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + pytest.param( + [ + pd.Series([1, 2, 3.0, 1.2], name="abc"), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] + ), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + ), + pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ], - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 
100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ] + * 7, + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", ), - ] - * 7, + ), ], ) def test_concat_series_dataframe_input(objs): diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 40ba415e681..cee3d23eadc 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -16,9 +16,13 @@ import cudf from cudf import read_csv -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if def make_numeric_dataframe(nrows, dtype): @@ -1270,14 +1274,14 @@ def test_csv_reader_delim_whitespace(): # with header row with pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), delim_whitespace=True) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True) assert_eq(pd_df, cu_df) # without header row with pytest.warns(FutureWarning): cu_df = read_csv(StringIO(buffer), 
delim_whitespace=True, header=None) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): pd_df = pd.read_csv( StringIO(buffer), delim_whitespace=True, header=None ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9122a1074ac..f4d1578bda7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -26,7 +26,11 @@ import cudf from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.errors import MixedTypeError @@ -3561,8 +3565,11 @@ def test_dataframe_empty_sort_index(): @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) def test_dataframe_sort_index( - index, axis, ascending, inplace, ignore_index, na_position + request, index, axis, ascending, inplace, ignore_index, na_position ): + if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: + pytest.skip(reason="Bug fixed in pandas-2.2") + pdf = pd.DataFrame( {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, index=index, @@ -3612,6 +3619,10 @@ def test_dataframe_sort_index( @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_dataframe_mulitindex_sort_index( request, axis, level, ascending, inplace, ignore_index, na_position ): @@ -6747,6 +6758,10 @@ def test_dataframe_init_from_arrays_cols(data, cols, index): None, ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in 
older versions of pandas", +) def test_dataframe_assign_scalar(request, col_data, assign_val): request.applymarker( pytest.mark.xfail( diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7be4faa42c3..4a2345fc009 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -14,7 +14,11 @@ import cudf import cudf.testing.dataset_generator as dataset_generator from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import DatetimeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -801,6 +805,10 @@ def test_to_datetime_different_formats_notimplemented(): cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas.", +) def test_datetime_can_cast_safely(): sr = cudf.Series( ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" @@ -847,6 +855,10 @@ def test_datetime_array_timeunit_cast(dtype): @pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_scalar_timeunit_cast(timeunit): testscalar = np.datetime64("2016-11-20", timeunit) @@ -1535,6 +1547,10 @@ def test_date_range_start_end_periods(start, end, periods): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_date_range_start_end_freq(start, end, freq): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1551,6 +1567,10 @@ def test_date_range_start_end_freq(start, end, freq): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in 
older versions of pandas", +) def test_date_range_start_freq_periods(start, freq, periods): if isinstance(freq, str): _gfreq = _pfreq = freq @@ -1643,6 +1663,9 @@ def test_date_range_raise_overflow(): ], ) def test_date_range_raise_unsupported(freqstr_unsupported): + if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): + pytest.skip(reason="YE, etc. support was added in pandas 2.2") + s, e = "2001-01-01", "2008-01-31" pd.date_range(start=s, end=e, freq=freqstr_unsupported) with pytest.raises(ValueError, match="does not yet support"): @@ -1654,7 +1677,7 @@ def test_date_range_raise_unsupported(freqstr_unsupported): if freqstr_unsupported != "3MS": freqstr_unsupported = freqstr_unsupported.lower() with pytest.raises(ValueError, match="does not yet support"): - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220): cudf.date_range(start=s, end=e, freq=freqstr_unsupported) @@ -1995,6 +2018,10 @@ def test_first(idx, offset): ) ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_first_start_at_end_of_month(idx, offset): p = pd.Series(range(len(idx)), index=idx) g = cudf.from_pandas(p) @@ -2319,6 +2346,10 @@ def test_datetime_to_str(data, dtype): assert_eq(actual.to_pandas(nullable=True), expected) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_datetime_string_to_datetime_resolution_loss_raises(): data = ["2020-01-01 00:00:00.00001"] dtype = "datetime64[s]" diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 794660cffcb..5d3d18cbe95 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -11,6 +11,7 @@ from packaging import version import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION pytestmark = 
pytest.mark.filterwarnings("ignore::FutureWarning") @@ -96,6 +97,10 @@ def prinoptions(cls): itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), ids=lambda docstring: docstring.name, ) + @pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Doctests not expected to pass on older versions of pandas", + ) def test_docstring(self, docstring): # We ignore differences in whitespace in the doctest output, and enable # the use of an ellipsis "..." to match any string in the doctest diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 74f04c0584f..0aaa71e50d7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -188,6 +188,10 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index): @pytest.mark.parametrize("engine", ["cudf", "jit"]) @pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_as_index_apply(pdf, gdf, as_index, engine): gdf = gdf.groupby("y", as_index=as_index).apply( lambda df: df["x"].mean(), engine=engine @@ -298,6 +302,10 @@ def assert_values_equal(arr): assert_values_equal(pddf[k].values) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply(): np.random.seed(0) df = DataFrame() @@ -338,6 +346,10 @@ def f3(df, k, L, m): @pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_args(func, args): np.random.seed(0) df = DataFrame() @@ -500,6 +512,10 @@ def func(df): "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) 
+@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): @@ -530,6 +546,10 @@ def func(df): # test unary index reductions for special values +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def groupby_apply_jit_idx_reductions_special_vals_inner( func, data, dtype, special_val ): @@ -555,6 +575,10 @@ def func(df): @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_jit_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -583,6 +607,10 @@ def test_groupby_apply_jit_reductions_special_vals( ], ) @pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="include_groups keyword new in pandas 2.2", +) def test_groupby_apply_jit_idx_reductions_special_vals( func, dtype, dataset, groupby_jit_datasets, special_val ): @@ -593,6 +621,10 @@ def test_groupby_apply_jit_idx_reductions_special_vals( @pytest.mark.parametrize("dtype", ["int32"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_sum_integer_overflow(dtype): max = np.iinfo(dtype).max @@ -627,6 +659,10 @@ def func(group): "large", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): 
dataset = groupby_jit_datasets[dataset] @@ -653,6 +689,10 @@ def func(group): @pytest.mark.parametrize("dtype", ["int32", "int64"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_correlation_zero_variance(dtype): # pearson correlation is undefined when the variance of either # variable is zero. This test ensures that the jit implementation @@ -711,6 +751,10 @@ def func(group): @pytest.mark.parametrize("dtype", ["uint8", "str"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_unsupported_dtype(dtype): df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df["b"] = df["b"].astype(dtype) @@ -739,6 +783,10 @@ def func(group): lambda df: df["val1"].mean() + df["val2"].std(), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_basic(func, groupby_jit_data_small): run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) @@ -759,12 +807,20 @@ def f3(df, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_jit_args_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): run_groupby_apply_jit_test( groupby_jit_data_small, func, ["key1", "key2"], *args ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_jit_block_divergence(): # https://github.com/rapidsai/cudf/issues/12686 df = cudf.DataFrame( @@ -782,6 +838,10 @@ def diverging_block(grp_df): run_groupby_apply_jit_test(df, diverging_block, ["a"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + 
reason="Fails in older versions of pandas", +) def test_groupby_apply_caching(): # Make sure similar functions that differ # by simple things like constants actually @@ -818,6 +878,10 @@ def f(group): assert precompiled.currsize == 3 +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_no_bytecode_fallback(): # tests that a function which contains no bytecode # attribute, but would still be executable using @@ -836,6 +900,10 @@ def f(group): assert_groupby_results_equal(expect, got) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` @@ -862,6 +930,10 @@ def func(df): @pytest.mark.parametrize("func", [lambda group: group.sum()]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_df(func): # tests a UDF that reduces over a dataframe # and produces a series with the original column names @@ -1940,6 +2012,10 @@ def test_groupby_agg_combinations(agg): ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_noempty_group(): pdf = pd.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} @@ -2208,6 +2284,10 @@ def f3(x, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_return_scalars_params() ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_apply_return_scalars(func, args): pdf = pd.DataFrame( { @@ -2266,6 +2346,10 @@ def f5(x, k, L, m): @pytest.mark.parametrize( "func,args", create_test_groupby_apply_return_series_dataframe_params() 
) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) def test_groupby_apply_return_series_dataframe(func, args): pdf = pd.DataFrame( {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} @@ -2744,6 +2828,10 @@ def test_groupby_diff_row_zero_shift(nelem): # TODO: test for category columns when cudf.Scalar supports category type @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2790,6 +2878,10 @@ def test_groupby_fillna_multi_value(nelem): # TODO: test for category columns when cudf.Scalar supports category type # TODO: cudf.fillna does not support decimal column to column fill yet @pytest.mark.parametrize("nelem", [10, 100, 1000]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_fillna_multi_value_df(nelem): t = rand_dataframe( dtypes_meta=[ @@ -2843,6 +2935,10 @@ def test_groupby_fillna_multi_value_df(nelem): "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] ) @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) def test_groupby_various_by_fillna(by, data, args): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -3146,6 +3242,10 @@ def test_groupby_freq_s(label, closed): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings only given on newer versions.", +) def test_groupby_get_group(pdf, group, name, obj): gdf = cudf.from_pandas(pdf) @@ -3644,6 +3744,10 @@ def test_group_by_pandas_sort_order(groups, sort): "last", ], ) +@pytest.mark.skipif( + 
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_reduction(dtype, reduce_op): gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) pdf = gdf.to_pandas() @@ -3664,6 +3768,10 @@ def test_group_by_empty_reduction(dtype, reduce_op): "apply_op", ["sum", "min", "max", "idxmax"], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_group_by_empty_apply(request, dtype, apply_op): request.applymarker( pytest.mark.xfail( @@ -3719,6 +3827,10 @@ def test_groupby_consecutive_operations(): assert_groupby_results_equal(actual, expected, check_dtype=False) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning only given on newer versions.", +) def test_categorical_grouping_pandas_compatibility(): gdf = cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 722a64cb553..3f483219423 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,6 +16,11 @@ import cudf from cudf.api.extensions import no_default +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -791,9 +796,27 @@ def test_index_to_series(data): "name_data,name_other", [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_index_difference(data, other, sort, name_data, name_other): pd_data = pd.Index(data, name=name_data) pd_other = pd.Index(other, name=name_other) + if ( + not PANDAS_GE_220 + and isinstance(pd_data.dtype, pd.CategoricalDtype) + and not isinstance(pd_other.dtype, 
pd.CategoricalDtype) + and pd_other.isnull().any() + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") + + if ( + not PANDAS_GE_220 + and len(pd_other) == 0 + and len(pd_data) != len(pd_data.unique()) + ): + pytest.skip(reason="Bug fixed in pandas-2.2+") gd_data = cudf.from_pandas(pd_data) gd_other = cudf.from_pandas(pd_other) @@ -1017,6 +1040,10 @@ def test_index_equal_misc(data, other): ["abcd", "defgh", "werty", "poiu"], ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append(data, other): pd_data = pd.Index(data) pd_other = pd.Index(other) @@ -1220,6 +1247,10 @@ def test_index_append_error(data, other): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) def test_index_append_list(data, other): pd_data = data pd_other = other @@ -2084,6 +2115,10 @@ def test_get_indexer_multi_numeric_deviate(key, method): @pytest.mark.parametrize("method", ["ffill", "bfill"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_get_indexer_multi_error(method): pi = pd.MultiIndex.from_tuples( [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] @@ -2527,7 +2562,7 @@ def test_isin_index(index, values): ) with expect_warning_if(is_dt_str): got = gidx.isin(values) - with expect_warning_if(is_dt_str): + with expect_warning_if(PANDAS_GE_220 and is_dt_str): expected = pidx.isin(values) assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 9df2852dde8..00ae99466bb 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1016,6 +1016,10 @@ def test_series_setitem_iloc(key, value, nulls): (slice(0, 2), [0.5, 0.25]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_series_setitem_dtype(key, value): psr = pd.Series([1, 2, 3], dtype="int32") gsr = cudf.from_pandas(psr) @@ -1634,6 +1638,10 @@ def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( assert_eq(expected, actual) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="No warning in older versions of pandas", +) def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) pdf = gdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index a4f0b9fc97e..c76a49103e2 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -125,6 +125,10 @@ def test_interpolate_series_values_or_index(data, index, method): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not fail on older versions of pandas", +) def test_interpolate_dataframe_error_cases(data, kwargs): gsr = cudf.DataFrame(data) psr = gsr.to_pandas() diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 2d194107658..5e1dd33fbf1 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -6,6 +6,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq @@ -168,6 +169,10 @@ def test_interval_index_unique(): @pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) @pytest.mark.parametrize("tz", ["US/Eastern", None]) +@pytest.mark.skipif( + condition=not PANDAS_GE_220, + reason="ME frequency new in pandas 2.2", +) def test_interval_with_datetime(tz, box): dti = pd.date_range( start=pd.Timestamp("20180101", tz=tz), diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 
9ea4ba007d2..9a95f0e01ab 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -1,13 +1,19 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. import itertools +import operator import string +from collections import defaultdict import numpy as np import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq @@ -35,10 +41,124 @@ def right(): # Behaviour in sort=False case didn't match documentation in many # cases prior to https://github.com/pandas-dev/pandas/pull/54611 # (released as part of pandas 2.2) -def expected(left, right, sort, *, how): - left = left.to_pandas() - right = right.to_pandas() - return left.merge(right, on="key", how=how, sort=sort) +if PANDAS_GE_220: + # Behaviour in sort=False case didn't match documentation in many + # cases prior to https://github.com/pandas-dev/pandas/pull/54611 + # (released as part of pandas 2.2) + def expected(left, right, sort, *, how): + left = left.to_pandas() + right = right.to_pandas() + return left.merge(right, on="key", how=how, sort=sort) + +else: + + def expect_inner(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + continue + for i in right_have[k]: + keys.append(k) + val_x.append(v) + val_y.append(right_val[i]) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. 
+ keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_left(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_outer(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + left_have = set(left_key) + for k, v in zip(right_key, right_val): + if k not in left_have: + keys.append(k) + val_x.append(None) + val_y.append(v) + + # Python sort is stable, so this will preserve input order for + # equal items. 
+ # outer joins are always sorted, but we test both sort values + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expected(left, right, sort, *, how): + if how == "inner": + return expect_inner(left, right, sort) + elif how == "outer": + return expect_outer(left, right, sort) + elif how == "left": + return expect_left(left, right, sort) + elif how == "right": + return expect_left(right, left, sort).rename( + {"val_x": "val_y", "val_y": "val_x"}, axis=1 + ) + else: + raise NotImplementedError() @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py index 7dd25ebc500..055bc5757b3 100644 --- a/python/cudf/cudf/tests/test_mvc.py +++ b/python/cudf/cudf/tests/test_mvc.py @@ -1,8 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
import subprocess import sys import pytest +from packaging import version IS_CUDA_11 = False IS_CUDA_12 = False @@ -14,9 +15,12 @@ # do not test cuda 12 if pynvjitlink isn't present HAVE_PYNVJITLINK = False try: + import numba import pynvjitlink # noqa: F401 - HAVE_PYNVJITLINK = True + HAVE_PYNVJITLINK = version.parse(numba.__version__) >= version.parse( + "0.58" + ) except ModuleNotFoundError: pass diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 1b0589254f5..b1a2f081cd2 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -373,7 +374,7 @@ def test_to_numeric_error(data, errors): ): cudf.to_numeric(data, errors=errors) else: - with expect_warning_if(errors == "ignore"): + with expect_warning_if(PANDAS_GE_220 and errors == "ignore"): expect = pd.to_numeric(data, errors=errors) with expect_warning_if(errors == "ignore"): got = cudf.to_numeric(data, errors=errors) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index e0884a5819a..c2a30b76bea 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1679,7 +1679,13 @@ def run_orc_columns_and_index_param(index_obj, index, columns): "columns", [ None, - [], + pytest.param( + [], + marks=pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Bug in older version of pandas", + ), + ), ], ) def test_orc_columns_and_index_param(index_obj, index, columns): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 6623c537ddf..8b59a7eef08 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -23,6 
+23,7 @@ import cudf from cudf._lib.parquet import read_parquet_chunked +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -3034,6 +3035,10 @@ def test_parquet_reader_rle_boolean(datadir): # a list column in a schema, the cudf reader was confusing # nesting information between a list column and a subsequent # string column, ultimately causing a crash. +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Older versions of pandas do not have DataFrame.map()", +) def test_parquet_reader_one_level_list2(datadir): # we are reading in a file containing binary types, but cudf returns # those as strings. so we have to massage the pandas data to get diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index a70a2ea15dd..f276f394cd0 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -10,6 +10,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import _utils as utils, assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand @@ -342,6 +343,10 @@ def test_any_all_axis_none(data, op): "median", ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def test_reductions_axis_none_warning(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) pdf = df.to_pandas() diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e5ee0127a74..3a8928297c0 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -10,7 +10,11 @@ import pytest import cudf -from cudf.core._compat import 
PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -66,7 +70,7 @@ def test_series_replace_all(gsr, to_replace, value): ) with expect_warning_if(expect_warn): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - with expect_warning_if(expect_warn): + with expect_warning_if(expect_warn and PANDAS_GE_220): if pd_value is None: # TODO: Remove this workaround once cudf # introduces `no_default` values @@ -91,7 +95,7 @@ def test_series_replace(): # Categorical psr3 = pd.Series(["one", "two", "three"], dtype="category") - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) with pytest.warns(FutureWarning): @@ -100,7 +104,7 @@ def test_series_replace(): psr4.sort_values().reset_index(drop=True), sr4.sort_values().reset_index(drop=True), ) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): psr5 = psr3.replace("one", "five") with pytest.warns(FutureWarning): sr5 = sr3.replace("one", "five") @@ -517,7 +521,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ), pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), @@ -564,7 +568,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace): pd.date_range( "2010-01-01", "2020-01-10", - freq="1YE", + freq="1YE" if PANDAS_GE_220 else "1y", ) ) + pd.Timedelta("1d"), @@ -1069,6 +1073,10 @@ def test_numeric_series_replace_dtype(series_dtype, replacement): ), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) def 
test_replace_inplace(pframe, replace_args): gpu_frame = cudf.from_pandas(pframe) pandas_frame = pframe.copy() diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index 95fa8e9a50a..a61477981f8 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq @@ -147,6 +148,10 @@ def test_dataframe_resample_level(): ("10D", "1D", "s"), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): # test that we cast to the appropriate frequency # when resampling: @@ -164,6 +169,10 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]") +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_resampling_downsampling_ms(): pdf = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 50db4302b75..4235affd4d1 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -8,10 +8,19 @@ import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import assert_eq -from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES +from cudf.testing._utils import ( + ALL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + expect_warning_if, +) pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ 
-220,7 +229,7 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna): with pytest.warns(FutureWarning): got = gdf.stack(level=level, dropna=dropna, future_stack=False) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = pdf.stack(level=level, dropna=dropna, future_stack=False) assert_eq(expect, got, check_dtype=False) @@ -265,7 +274,7 @@ def test_df_stack_multiindex_column_axis_pd_example(level): df = pd.DataFrame(np.random.randn(4, 4), columns=columns) - with pytest.warns(FutureWarning): + with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) gdf = cudf.from_pandas(df) with pytest.warns(FutureWarning): diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index d5f63fdab77..f952cea07f8 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -447,6 +447,10 @@ def test_cov1d(data1, data2): ], ) @pytest.mark.parametrize("method", ["spearman", "pearson"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings missing on older pandas (scipy version seems unrelated?)", +) def test_corr1d(data1, data2, method): if method == "spearman": # Pandas uses scipy.stats.spearmanr code-path @@ -585,6 +589,10 @@ def test_min_count_ops(data, ops, skipna, min_count): ], ) @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_cov_corr_datetime_timedelta(data1, data2, dtype): gsr1 = cudf.Series(data1, dtype=dtype) gsr2 = cudf.Series(data2, dtype=dtype) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 505d5d0b9cc..d10c531d757 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ 
b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -23,6 +23,7 @@ from numba import NumbaDeprecationWarning from pytz import utc +from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object @@ -536,12 +537,15 @@ def test_array_ufunc(series): @pytest.mark.xfail(strict=False, reason="Fails in CI, passes locally.") def test_groupby_apply_func_returns_series(dataframe): pdf, df = dataframe + if PANDAS_GE_220: + kwargs = {"include_groups": False} + else: + kwargs = {} + expect = pdf.groupby("a").apply( - lambda group: pd.Series({"x": 1}), include_groups=False - ) - got = df.groupby("a").apply( - lambda group: xpd.Series({"x": 1}), include_groups=False + lambda group: pd.Series({"x": 1}), **kwargs ) + got = df.groupby("a").apply(lambda group: xpd.Series({"x": 1}), **kwargs) tm.assert_equal(expect, got) diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py index d84235481c3..e4e79b7b8cf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_applymap.py +++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py @@ -5,6 +5,8 @@ from dask import dataframe as dd +from cudf.core._compat import PANDAS_GE_210 + from dask_cudf.tests.utils import _make_random_frame @@ -18,6 +20,10 @@ ], ) @pytest.mark.parametrize("has_na", [True, False]) +@pytest.mark.skipif( + not PANDAS_GE_210, + reason="DataFrame.map requires pandas>=2.1.0", +) def test_applymap_basic(func, has_na): size = 2000 pdf, dgdf = _make_random_frame(size, include_na=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index be10b0d4843..d03180852eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -80,6 +80,11 @@ def test_str_series_roundtrip(): def test_p2p_shuffle(): + pytest.importorskip( + "pyarrow", + 
minversion="14.0.1", + reason="P2P shuffling requires pyarrow>=14.0.1", + ) # Check that we can use `shuffle_method="p2p"` with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: with Client(cluster): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cf916b713b2..7b9f0ca328a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -9,6 +9,7 @@ from dask.utils_test import hlg_layer import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing._utils import expect_warning_if import dask_cudf @@ -316,6 +317,10 @@ def test_groupby_dropna_cudf(dropna, by): (None, ["a", "d"]), ], ) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) def test_groupby_dropna_dask(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py)