From bcc000554c8bd9ebd5cafe7c61b1f0090ab9d53b Mon Sep 17 00:00:00 2001
From: Wenjun Si
Date: Thu, 2 Nov 2023 11:13:52 +0800
Subject: [PATCH] Fix test failures (#3362)

---
 .github/workflows/benchmark-ci.yml            |  2 +-
 .github/workflows/platform-ci.yml             |  7 ++--
 mars/dataframe/contrib/raydataset/dataset.py  |  6 ++--
 .../dataframe/contrib/raydataset/mldataset.py |  5 +--
 .../raydataset/tests/test_mldataset.py        |  6 +---
 mars/dataframe/datasource/read_raydataset.py  | 12 +++++++
 .../tests/test_datasource_execution.py        |  4 ---
 .../merge/tests/test_merge_execution.py       | 32 +++++++++++--------
 mars/dataframe/utils.py                       |  4 +--
 mars/learn/contrib/lightgbm/_predict.py       |  2 +-
 mars/learn/contrib/lightgbm/_train.py         |  6 ++--
 mars/learn/contrib/lightgbm/core.py           |  1 -
 .../contrib/lightgbm/tests/test_classifier.py |  8 ++---
 mars/learn/linear_model/_base.py              |  2 +-
 mars/learn/linear_model/tests/test_base.py    |  4 +--
 mars/learn/metrics/pairwise/pairwise.py       |  2 +-
 mars/storage/vineyard.py                      | 10 +++---
 mars/tensor/base/tile.py                      |  2 +-
 mars/tensor/utils.py                          |  2 +-
 mars/utils.py                                 |  2 ++
 setup.cfg                                     |  2 +-
 21 files changed, 66 insertions(+), 55 deletions(-)

diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml
index 0e0d44ee1e..5e924bc483 100644
--- a/.github/workflows/benchmark-ci.yml
+++ b/.github/workflows/benchmark-ci.yml
@@ -57,7 +57,7 @@ jobs:
           git fetch upstream
           git merge upstream/master
           asv machine --yes
-          asv continuous -e -f 1.1 --strict upstream/master HEAD
+          asv continuous -e -f 1.1 upstream/master HEAD
         if: ${{ steps.build.outcome == 'success' }}
 
       - name: Publish benchmarks artifact
diff --git a/.github/workflows/platform-ci.yml b/.github/workflows/platform-ci.yml
index 9ded51b9ef..ed21ea42d9 100644
--- a/.github/workflows/platform-ci.yml
+++ b/.github/workflows/platform-ci.yml
@@ -89,7 +89,9 @@ jobs:
             ./ci/install-hadoop.sh
             echo "import coverage; coverage.process_startup()" > \
               $(python -c "import site; print(site.getsitepackages()[-1])")/coverage.pth
-            conda install -n test --quiet --yes -c conda-forge python=$PYTHON skein libffi conda-pack
+            sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+            sudo apt install -y g++-11
+            conda install -n test --quiet --yes -c conda-forge python=$PYTHON skein libffi conda-pack "grpcio<1.54"
           fi
           if [ -n "$WITH_VINEYARD" ]; then
             pip install vineyard -i https://pypi.org/simple
@@ -104,8 +106,7 @@
             rm -fr /tmp/etcd-$ETCD_VER-linux-amd64.tar.gz /tmp/etcd-download-test
           fi
           if [ -n "$WITH_RAY" ] || [ -n "$WITH_RAY_DAG" ] || [ -n "$WITH_RAY_DEPLOY" ]; then
-            pip install "ray>=1.8.0,<2.4.0"
-            pip install "xgboost_ray<0.1.14" "protobuf<4"
+            pip install "ray>=1.8.0,<2.4.0" "xgboost<2" "xgboost_ray<0.1.14" "protobuf<4"
             # Ray Datasets need pyarrow>=6.0.1
             pip install "pyarrow>=6.0.1"
             pip install lightgbm
diff --git a/mars/dataframe/contrib/raydataset/dataset.py b/mars/dataframe/contrib/raydataset/dataset.py
index 21581c59b8..0069cfa220 100644
--- a/mars/dataframe/contrib/raydataset/dataset.py
+++ b/mars/dataframe/contrib/raydataset/dataset.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import operator
 from functools import reduce
 
@@ -55,8 +56,9 @@ def __getstate__():
         state.pop("dataframe", None)
         return state
 
-    # `dataframe` is not serializable by ray.
-    dataset.__getstate__ = __getstate__
+    if not hasattr(type(dataset), "__getstate__"):
+        # if `dataframe` is not serializable by ray, patch our implementation
+        dataset.__getstate__ = __getstate__
 
     return dataset
 
diff --git a/mars/dataframe/contrib/raydataset/mldataset.py b/mars/dataframe/contrib/raydataset/mldataset.py
index b34b5ca333..abf05c5e7a 100644
--- a/mars/dataframe/contrib/raydataset/mldataset.py
+++ b/mars/dataframe/contrib/raydataset/mldataset.py
@@ -131,6 +131,7 @@ def __getstate__():
         state.pop("dataframe", None)
         return state
 
-    # `dataframe` is not serializable by ray.
-    dataset.__getstate__ = __getstate__
+    if not hasattr(dataset, "__getstate__"):
+        # `dataframe` is not serializable by ray.
+        dataset.__getstate__ = __getstate__
     return dataset
diff --git a/mars/dataframe/contrib/raydataset/tests/test_mldataset.py b/mars/dataframe/contrib/raydataset/tests/test_mldataset.py
index 81191ce902..2ea907b646 100644
--- a/mars/dataframe/contrib/raydataset/tests/test_mldataset.py
+++ b/mars/dataframe/contrib/raydataset/tests/test_mldataset.py
@@ -28,11 +28,7 @@
 
 ray = lazy_import("ray")
 ml_dataset = lazy_import("ray.util.data", rename="ml_dataset")
-
-try:
-    import xgboost_ray
-except ImportError:  # pragma: no cover
-    xgboost_ray = None
+xgboost_ray = lazy_import("xgboost_ray")
 try:
     import sklearn
 except ImportError:  # pragma: no cover
diff --git a/mars/dataframe/datasource/read_raydataset.py b/mars/dataframe/datasource/read_raydataset.py
index 4a14b047c6..090abbaa03 100644
--- a/mars/dataframe/datasource/read_raydataset.py
+++ b/mars/dataframe/datasource/read_raydataset.py
@@ -123,9 +123,21 @@ def read_ray_dataset(ds, columns=None, incremental_index=False, **kwargs):
         from ray.data.impl.pandas_block import PandasBlockSchema
     except ImportError:  # pragma: no cover
         PandasBlockSchema = type(None)
+    try:
+        from ray.data.dataset import Schema as RayDatasetSchema
+    except ImportError:
+        RayDatasetSchema = type(None)
 
     if isinstance(schema, PandasBlockSchema):
         dtypes = pd.Series(schema.types, index=schema.names)
+    elif isinstance(schema, RayDatasetSchema):
+        dtypes = pd.Series(
+            [
+                t.to_pandas_dtype() if t is not object else np.dtype("O")
+                for t in schema.types
+            ],
+            index=schema.names,
+        )
     elif isinstance(schema, pa.Schema):
         dtypes = schema.empty_table().to_pandas().dtypes
     else:
diff --git a/mars/dataframe/datasource/tests/test_datasource_execution.py b/mars/dataframe/datasource/tests/test_datasource_execution.py
index 3dd695f941..2bbb777085 100644
--- a/mars/dataframe/datasource/tests/test_datasource_execution.py
+++ b/mars/dataframe/datasource/tests/test_datasource_execution.py
@@ -1288,10 +1288,6 @@ def test_read_raydataset(ray_start_regular, ray_create_mars_cluster):
         pdf2,
     )
 
-    # Test simple datasets
-    with pytest.raises(NotImplementedError):
-        ray.data.range(10).to_mars()
-
 
 @require_ray
 @pytest.mark.skipif(
diff --git a/mars/dataframe/merge/tests/test_merge_execution.py b/mars/dataframe/merge/tests/test_merge_execution.py
index 4bb8292ea1..62281b7199 100644
--- a/mars/dataframe/merge/tests/test_merge_execution.py
+++ b/mars/dataframe/merge/tests/test_merge_execution.py
@@ -312,11 +312,15 @@ def test_join_on(setup):
     expected4.set_index("a2", inplace=True)
     result4.set_index("a2", inplace=True)
     pd.testing.assert_frame_equal(
-        sort_dataframe_inplace(expected4, 0), sort_dataframe_inplace(result4, 0)
+        sort_dataframe_inplace(expected4, 0, kind="mergesort"),
+        sort_dataframe_inplace(result4, 0, kind="mergesort"),
     )
 
 
 def test_merge_one_chunk(setup):
+    def sort_by_col1(df):
+        return df.sort_values(by=df.columns[1], kind="mergesort")
+
     df1 = pd.DataFrame(
         {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]},
         index=["a1", "a2", "a3", "a4"],
@@ -348,8 +352,8 @@ def test_merge_one_chunk(setup):
 
     result = jdf.execute().fetch()
     pd.testing.assert_frame_equal(
-        expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
-        result.sort_values(by=result.columns[1]).reset_index(drop=True),
+        sort_by_col1(expected).reset_index(drop=True),
+        sort_by_col1(result).reset_index(drop=True),
     )
 
     # right have one chunk
@@ -361,8 +365,8 @@ def test_merge_one_chunk(setup):
 
     result = jdf.execute().fetch()
     pd.testing.assert_frame_equal(
-        expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
-        result.sort_values(by=result.columns[1]).reset_index(drop=True),
+        sort_by_col1(expected).reset_index(drop=True),
+        sort_by_col1(result).reset_index(drop=True),
     )
 
     # left have one chunk and how="left", then one chunk tile
@@ -377,8 +381,8 @@ def test_merge_one_chunk(setup):
 
     result = jdf.execute().fetch()
     pd.testing.assert_frame_equal(
-        expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
-        result.sort_values(by=result.columns[1]).reset_index(drop=True),
+        sort_by_col1(expected).reset_index(drop=True),
+        sort_by_col1(result).reset_index(drop=True),
    )
 
 
@@ -418,7 +422,8 @@ def test_broadcast_merge(setup):
     expected.set_index("key", inplace=True)
     result.set_index("key", inplace=True)
     pd.testing.assert_frame_equal(
-        sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0)
+        sort_dataframe_inplace(expected, 0, kind="mergesort"),
+        sort_dataframe_inplace(result, 0, kind="mergesort"),
     )
 
     # test broadcast right and how="left"
@@ -438,8 +443,8 @@ def test_broadcast_merge(setup):
     expected.set_index("key", inplace=True)
     result.set_index("key", inplace=True)
     pd.testing.assert_frame_equal(
-        expected.sort_values(by=["key", "value_x"]),
-        result.sort_values(by=["key", "value_x"]),
+        expected.sort_values(by=["key", "value_x"], kind="mergesort"),
+        result.sort_values(by=["key", "value_x"], kind="mergesort"),
     )
 
     # test broadcast left
@@ -459,7 +464,8 @@ def test_broadcast_merge(setup):
     expected.set_index("key", inplace=True)
     result.set_index("key", inplace=True)
     pd.testing.assert_frame_equal(
-        sort_dataframe_inplace(expected, 0), sort_dataframe_inplace(result, 0)
+        sort_dataframe_inplace(expected, 0, kind="mergesort"),
+        sort_dataframe_inplace(result, 0, kind="mergesort"),
     )
 
     # test broadcast left and how="right"
@@ -479,8 +485,8 @@ def test_broadcast_merge(setup):
     expected.set_index("key", inplace=True)
     result.set_index("key", inplace=True)
     pd.testing.assert_frame_equal(
-        expected.sort_values(by=["key", "value_x"]),
-        result.sort_values(by=["key", "value_x"]),
+        expected.sort_values(by=["key", "value_x"], kind="mergesort"),
+        result.sort_values(by=["key", "value_x"], kind="mergesort"),
     )
 
 
diff --git a/mars/dataframe/utils.py b/mars/dataframe/utils.py
index 513c99208c..9df187bb6a 100644
--- a/mars/dataframe/utils.py
+++ b/mars/dataframe/utils.py
@@ -106,9 +106,9 @@ def hash_dtypes(dtypes, size):
     return [dtypes[index] for index in hashed_indexes]
 
 
-def sort_dataframe_inplace(df, *axis):
+def sort_dataframe_inplace(df, *axis, **kw):
     for ax in axis:
-        df.sort_index(axis=ax, inplace=True)
+        df.sort_index(axis=ax, inplace=True, **kw)
     return df
 
 
diff --git a/mars/learn/contrib/lightgbm/_predict.py b/mars/learn/contrib/lightgbm/_predict.py
index 3fbf021642..857acf031c 100644
--- a/mars/learn/contrib/lightgbm/_predict.py
+++ b/mars/learn/contrib/lightgbm/_predict.py
@@ -78,7 +78,7 @@ def __call__(self):
         elif hasattr(self.model, "classes_"):
             dtype = np.array(self.model.classes_).dtype
         else:
-            dtype = getattr(self.model, "out_dtype_", np.dtype("float"))
+            dtype = getattr(self.model, "out_dtype_", [np.dtype("float")])[0]
 
         if self.output_types[0] == OutputType.tensor:
             # tensor
diff --git a/mars/learn/contrib/lightgbm/_train.py b/mars/learn/contrib/lightgbm/_train.py
index cf0f42cc95..2efb427212 100644
--- a/mars/learn/contrib/lightgbm/_train.py
+++ b/mars/learn/contrib/lightgbm/_train.py
@@ -406,11 +406,11 @@ def execute(cls, ctx, op: "LGBMTrain"):
                 op.model_type == LGBMModelType.RANKER
                 or op.model_type == LGBMModelType.REGRESSOR
             ):
-                model.set_params(out_dtype_=np.dtype("float"))
+                model.set_params(out_dtype_=[np.dtype("float")])
             elif hasattr(label_val, "dtype"):
-                model.set_params(out_dtype_=label_val.dtype)
+                model.set_params(out_dtype_=[label_val.dtype])
             else:
-                model.set_params(out_dtype_=label_val.dtypes[0])
+                model.set_params(out_dtype_=[label_val.dtypes[0]])
 
             ctx[op.outputs[0].key] = pickle.dumps(model)
         finally:
diff --git a/mars/learn/contrib/lightgbm/core.py b/mars/learn/contrib/lightgbm/core.py
index ff050cdbb0..7da06cfa56 100644
--- a/mars/learn/contrib/lightgbm/core.py
+++ b/mars/learn/contrib/lightgbm/core.py
@@ -20,7 +20,6 @@
 import pandas as pd
 
 from ....dataframe import DataFrame as MarsDataFrame, Series as MarsSeries
-from ....lib.version import parse as parse_version
 from ....tensor import tensor as mars_tensor
 
 
diff --git a/mars/learn/contrib/lightgbm/tests/test_classifier.py b/mars/learn/contrib/lightgbm/tests/test_classifier.py
index db7425fc86..28fd623421 100644
--- a/mars/learn/contrib/lightgbm/tests/test_classifier.py
+++ b/mars/learn/contrib/lightgbm/tests/test_classifier.py
@@ -75,9 +75,7 @@ def test_local_classifier(create_cluster):
     # test sparse tensor
     X_sparse_data = X_sparse
     classifier = LGBMClassifier(n_estimators=2)
-    classifier.fit(
-        X_sparse_data, y_data, eval_set=[(X_sparse_data, y_data)]
-    )
+    classifier.fit(X_sparse_data, y_data, eval_set=[(X_sparse_data, y_data)])
     prediction = classifier.predict(X_sparse_data)
 
     assert prediction.ndim == 1
@@ -118,9 +116,7 @@ def test_local_classifier(create_cluster):
 
     # should raise error if weight.ndim > 1
     with pytest.raises(ValueError):
-        LGBMClassifier(n_estimators=2).fit(
-            X, y_df, sample_weight=mt.random.rand(1, 1)
-        )
+        LGBMClassifier(n_estimators=2).fit(X, y_df, sample_weight=mt.random.rand(1, 1))
 
     # test binary classifier
     new_y = (y_data > 0.5).astype(mt.int32)
diff --git a/mars/learn/linear_model/_base.py b/mars/learn/linear_model/_base.py
index 57cf04444e..89a68f19a4 100644
--- a/mars/learn/linear_model/_base.py
+++ b/mars/learn/linear_model/_base.py
@@ -301,7 +301,7 @@ def fit(self, X, y, sample_weight=None):
             self.coef_.execute()
         except LinAlgError:
             # TODO: implement linalg.lstsq first
-            raise NotImplementedError("Does not support sigular matrix!")
+            raise NotImplementedError("Does not support singular matrix!")
 
         if y.ndim == 1:
             self.coef_ = mt.ravel(self.coef_)
diff --git a/mars/learn/linear_model/tests/test_base.py b/mars/learn/linear_model/tests/test_base.py
index ff712d786b..519113f864 100644
--- a/mars/learn/linear_model/tests/test_base.py
+++ b/mars/learn/linear_model/tests/test_base.py
@@ -53,7 +53,7 @@ def test_linear_regression(setup):
     assert_array_almost_equal(reg.predict(X), model.predict(X))
 
     # Regular model fitting, #samples <= 2, # features < 2
-    error_msg = re.escape("Does not support sigular matrix!")
+    error_msg = re.escape("Does not support singular matrix!")
 
     X = [[1], [2]]
     Y = [1, 2]
@@ -69,7 +69,7 @@ def test_linear_regression(setup):
     assert_array_almost_equal(reg.predict(X), model.predict(X))
 
     # Extra case #1: singular matrix, degenerate input
-    error_msg = re.escape("Does not support sigular matrix!")
+    error_msg = re.escape("Does not support singular matrix!")
 
     X = [[1]]
     Y = [0]
diff --git a/mars/learn/metrics/pairwise/pairwise.py b/mars/learn/metrics/pairwise/pairwise.py
index ff0f991246..f3aba75d97 100644
--- a/mars/learn/metrics/pairwise/pairwise.py
+++ b/mars/learn/metrics/pairwise/pairwise.py
@@ -72,7 +72,7 @@
     "precomputed": None,  # HACK: precomputed is always allowed, never called
 }
 
-# These distances recquire boolean tensors, when using mars.tensor.spatial.distance
+# These distances require boolean tensors, when using mars.tensor.spatial.distance
 PAIRWISE_BOOLEAN_FUNCTIONS = [
     "dice",
     "jaccard",
diff --git a/mars/storage/vineyard.py b/mars/storage/vineyard.py
index 3eea9d2fdc..1ce03f0221 100644
--- a/mars/storage/vineyard.py
+++ b/mars/storage/vineyard.py
@@ -144,11 +144,11 @@ async def setup(cls, **kwargs) -> Tuple[Dict, Dict]:
             vineyard_store = None
         else:
             vineyard_store = vineyard.deploy.local.start_vineyardd(
-                etcd_endpoints,
-                etcd_prefix,
-                vineyardd_path,
-                vineyard_size,
-                vineyard_socket,
+                etcd_endpoints=etcd_endpoints,
+                etcd_prefix=etcd_prefix,
+                vineyardd_path=vineyardd_path,
+                size=vineyard_size,
+                socket=vineyard_socket,
                 rpc=False,
             )
         vineyard_socket = (
diff --git a/mars/tensor/base/tile.py b/mars/tensor/base/tile.py
index fc9e344939..983c0d03ad 100644
--- a/mars/tensor/base/tile.py
+++ b/mars/tensor/base/tile.py
@@ -30,7 +30,7 @@ def tile(A, reps):
     behavior, promote `A` to d-dimensions manually before calling this
     function.
 
-    If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it.
+    If ``A.ndim > d``, `reps` is promoted to `A`.ndim by prepending 1's to it.
     Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as
     (1, 1, 2, 2).
 
diff --git a/mars/tensor/utils.py b/mars/tensor/utils.py
index 78026f2768..3e99dca616 100644
--- a/mars/tensor/utils.py
+++ b/mars/tensor/utils.py
@@ -774,7 +774,7 @@ def fetch_corner_data(tensor, session=None):
     # the tensor must have been executed,
     # thus the size could not be NaN
     if tensor.size > threshold:
-        # two edges for each exis
+        # two edges for each axis
         indices_iter = list(itertools.product(*(range(2) for _ in range(tensor.ndim))))
         corners = np.empty(shape=(2,) * tensor.ndim, dtype=object)
         shape = [0 for _ in range(tensor.ndim)]
diff --git a/mars/utils.py b/mars/utils.py
index b08ec8e2f9..6f2b807d94 100644
--- a/mars/utils.py
+++ b/mars/utils.py
@@ -380,8 +380,10 @@ def __getattr__(self, item):
         elif locals is not None:
             locals[rename] = real_mod
         ret = getattr(real_mod, item)
+
         for on_load_func in self._on_loads:
             on_load_func()
+
         # make sure on_load hooks only executed once
         self._on_loads = []
         return ret
diff --git a/setup.cfg b/setup.cfg
index 9f9fcf17ce..d8415d3fc1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,7 +30,7 @@ install_requires =
     pandas>=1.0.0,<2.0.0
     scipy>=1.0.0
     scikit-learn>=0.20
-    numexpr>=2.6.4,!=2.8.5
+    numexpr>=2.6.4,!=2.8.5,!=2.8.6
     cloudpickle>=1.5.0
     pyyaml>=5.1
     psutil>=5.9.0