Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make tests more deterministic #17008

Open
wants to merge 30 commits into
base: branch-24.12
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8b462e8
use nep rule
galipremsagar Oct 4, 2024
763ae35
update pre-commit config
galipremsagar Oct 4, 2024
027e148
first pass
galipremsagar Oct 5, 2024
828706e
Switch numpy random calls to latest API
galipremsagar Oct 7, 2024
40ad066
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 7, 2024
858afbb
improve
galipremsagar Oct 7, 2024
16e61b2
Merge branch 'branch-24.12' into numpy_random
galipremsagar Oct 7, 2024
66edb86
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 11, 2024
ebaa562
more fixes
galipremsagar Oct 11, 2024
e100e2d
fix issues
galipremsagar Oct 11, 2024
33234fc
fix more failures
galipremsagar Oct 12, 2024
5915e5b
update pre-commit
galipremsagar Oct 12, 2024
7087da9
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 12, 2024
3d35be1
fix default seed
galipremsagar Oct 12, 2024
8336a5f
style
galipremsagar Oct 12, 2024
cd7e198
update files
galipremsagar Oct 14, 2024
bc73f31
update
galipremsagar Oct 14, 2024
4e81055
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 14, 2024
0939ba1
update
galipremsagar Oct 14, 2024
8f4efb7
update
galipremsagar Oct 14, 2024
de0813c
update notebook
galipremsagar Oct 14, 2024
3158d9c
Apply suggestions from code review
galipremsagar Oct 15, 2024
a02907b
address reviews
galipremsagar Oct 15, 2024
56f8ee0
Merge branch 'numpy_random' of https://github.com/galipremsagar/cudf …
galipremsagar Oct 15, 2024
4f3ca74
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 15, 2024
4a9e944
merge into one
galipremsagar Oct 15, 2024
b8f964b
address reviews
galipremsagar Oct 15, 2024
5a7afc6
Merge remote-tracking branch 'upstream/branch-24.12' into numpy_random
galipremsagar Oct 16, 2024
2350a46
fix struct data type corruption
galipremsagar Oct 16, 2024
ab1ddda
Merge branch 'branch-24.12' into numpy_random
galipremsagar Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,18 @@ repos:
entry: 'pytest\.xfail'
language: pygrep
types: [python]
- id: no-unseeded-default-rng
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a preference to keep these checks separated? It seems like we could do something similar to the check that @mroeschke linked, consolidating all of these entries into a single check that runs against all Python files at once.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I quite like the separate entries, since the name of the entry gives some information as to what went wrong. If the regex gets complicated, I find it hard to see what's going on. But I don't have strong feelings here.

name: no-unseeded-default-rng
description: 'Enforce that no non-seeded default_rng is used'
entry: 'default_rng\(\)'
language: pygrep
types: [python]
- id: no-np-random-seed
name: no-np-random-seed
description: 'Enforce that default_rng is used instead of np.random.seed'
entry: 'random.seed\('
language: pygrep
types: [python]
- id: cmake-format
name: cmake-format
entry: ./cpp/scripts/run-cmake-format.sh cmake-format
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ select = [
"UP007",
# Import from `collections.abc` instead: `Callable`
"UP035",
"NPY002",
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
]
ignore = [
# whitespace before :
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/benchmarks/API/bench_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe):

@pytest_cases.parametrize("nr", NUM_ROWS)
def bench_crosstab_simple(benchmark, nr):
rng = np.random.default_rng(seed=0)
series_a = np.array(["foo", "bar"] * nr)
series_b = np.array(["one", "two"] * nr)
series_c = np.array(["dull", "shiny"] * nr)
np.random.shuffle(series_a)
np.random.shuffle(series_b)
np.random.shuffle(series_c)
rng.shuffle(series_a)
rng.shuffle(series_b)
rng.shuffle(series_c)
series_a = cudf.Series(series_a)
series_b = cudf.Series(series_b)
series_c = cudf.Series(series_c)
Expand Down
12 changes: 7 additions & 5 deletions python/cudf/benchmarks/API/bench_multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

"""Benchmarks of MultiIndex methods."""

Expand All @@ -11,16 +11,18 @@
@pytest.fixture
def pidx():
num_elements = int(1e3)
a = np.random.randint(0, num_elements // 10, num_elements)
b = np.random.randint(0, num_elements // 10, num_elements)
rng = np.random.default_rng(seed=0)
a = rng.integers(0, num_elements // 10, num_elements)
b = rng.integers(0, num_elements // 10, num_elements)
return pd.MultiIndex.from_arrays([a, b], names=("a", "b"))


@pytest.fixture
def midx(pidx):
num_elements = int(1e3)
a = np.random.randint(0, num_elements // 10, num_elements)
b = np.random.randint(0, num_elements // 10, num_elements)
rng = np.random.default_rng(seed=0)
a = rng.integers(0, num_elements // 10, num_elements)
b = rng.integers(0, num_elements // 10, num_elements)
df = cudf.DataFrame({"a": a, "b": b})
return cudf.MultiIndex.from_frame(df)

Expand Down
15 changes: 8 additions & 7 deletions python/cudf/cudf/_fuzz_testing/avro.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import io
Expand Down Expand Up @@ -68,12 +68,12 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/6604
- cudf.utils.dtypes.TIMEDELTA_TYPES
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)

self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand All @@ -100,17 +100,18 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._df.columns))
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
np.unique(rng.choice(self._df.columns, col_size))
)
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[None, self._rand(len(self._df))]
)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
40 changes: 18 additions & 22 deletions python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def generate_input(self):
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -77,25 +77,22 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "usecols":
col_size = self._rand(len(self._df.columns))
col_val = np.random.choice(
col_val = rng.choice(
[
None,
np.unique(
np.random.choice(self._df.columns, col_size)
),
np.unique(rng.choice(self._df.columns, col_size)),
]
)
params_dict[param] = (
col_val if col_val is None else list(col_val)
)
elif param == "dtype":
dtype_val = np.random.choice(
[None, self._df.dtypes.to_dict()]
)
dtype_val = rng.choice([None, self._df.dtypes.to_dict()])
if dtype_val is not None:
dtype_val = {
col_name: "category"
Expand All @@ -105,25 +102,25 @@ def set_rand_params(self, params):
}
params_dict[param] = dtype_val
elif param == "header":
header_val = np.random.choice(
["infer", np.random.randint(low=0, high=len(self._df))]
header_val = rng.choice(
["infer", rng.integers(low=0, high=len(self._df))]
)
params_dict[param] = header_val
elif param == "skiprows":
params_dict[param] = np.random.randint(
params_dict[param] = rng.integers(
low=0, high=len(self._df)
)
elif param == "skipfooter":
params_dict[param] = np.random.randint(
params_dict[param] = rng.integers(
low=0, high=len(self._df)
)
elif param == "nrows":
nrows_val = np.random.choice(
[None, np.random.randint(low=0, high=len(self._df))]
nrows_val = rng.choice(
[None, rng.integers(low=0, high=len(self._df))]
)
params_dict[param] = nrows_val
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -159,7 +156,7 @@ def generate_input(self):
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -182,26 +179,25 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._current_buffer.columns))
params_dict[param] = list(
np.unique(
np.random.choice(
self._current_buffer.columns, col_size
)
rng.choice(self._current_buffer.columns, col_size)
)
)
elif param == "chunksize":
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[
None,
np.random.randint(
rng.integers(
low=1, high=max(1, len(self._current_buffer))
),
]
)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
5 changes: 3 additions & 2 deletions python/cudf/cudf/_fuzz_testing/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import json
Expand Down Expand Up @@ -91,8 +91,9 @@ def get_next_regression_params(self):
return dtypes_meta, num_rows, num_cols, seed

def set_rand_params(self, params):
rng = np.random.default_rng(seed=None)
params_dict = {
param: np.random.choice(values) for param, values in params.items()
param: rng.choice(values) for param, values in params.items()
}
self._current_params["test_kwargs"] = self.process_kwargs(
params_dict=params_dict
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7086
# dtypes_list.extend(["list"])
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -105,14 +105,15 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if param == "dtype" and values == ALL_POSSIBLE_VALUES:
dtype_val = np.random.choice(
dtype_val = rng.choice(
[True, self._current_buffer.dtypes.to_dict()]
)
params_dict[param] = _get_dtype_param_value(dtype_val)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -155,7 +156,7 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7086
# dtypes_list.extend(["list"])
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["seed"] = seed
Expand All @@ -180,12 +181,13 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if param == "dtype" and values == ALL_POSSIBLE_VALUES:
dtype_val = np.random.choice(
dtype_val = rng.choice(
[True, self._current_buffer.dtypes.to_dict()]
)
params_dict[param] = _get_dtype_param_value(dtype_val)
else:
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
28 changes: 12 additions & 16 deletions python/cudf/cudf/_fuzz_testing/orc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

import copy
import io
Expand Down Expand Up @@ -62,13 +62,11 @@ def generate_input(self):
- cudf.utils.dtypes.UNSIGNED_TYPES
- {"datetime64[ns]"}
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)

self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand All @@ -94,42 +92,41 @@ def write_data(self, file_name):

def set_rand_params(self, params):
params_dict = {}
rng = np.random.default_rng(seed=None)
for param, values in params.items():
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._df.columns))
params_dict[param] = list(
np.unique(np.random.choice(self._df.columns, col_size))
np.unique(rng.choice(self._df.columns, col_size))
)
elif param == "stripes":
f = io.BytesIO(self._current_buffer)
orcFile = pa.orc.ORCFile(f)
stripes = list(range(orcFile.nstripes))
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[
None,
list(
map(
int,
np.unique(
np.random.choice(
stripes, orcFile.nstripes
)
rng.choice(stripes, orcFile.nstripes)
),
)
),
]
)
elif param == "use_index":
params_dict[param] = np.random.choice([True, False])
params_dict[param] = rng.choice([True, False])
elif param in ("skiprows", "num_rows"):
params_dict[param] = np.random.choice(
params_dict[param] = rng.choice(
[None, self._rand(len(self._df))]
)
else:
if not isinstance(values, list):
raise TypeError("values must be of type list")
params_dict[param] = np.random.choice(values)
params_dict[param] = rng.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)


Expand Down Expand Up @@ -177,12 +174,11 @@ def generate_input(self):
# https://github.com/rapidsai/cudf/issues/7355
- cudf.utils.dtypes.DATETIME_TYPES
)

seed = random.randint(0, 2**32 - 1)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
self, dtypes_list, seed
)
self._current_params["dtypes_meta"] = dtypes_meta
seed = random.randint(0, 2**32 - 1)
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_cols"] = num_cols
Expand Down
Loading
Loading