intel · md-shafiul-alam · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
@@ -150,7 +150,13 @@ def _onedal_fit(self, X, sample_weight=None, queue=None):
             self._validate_params()
 
         if sklearn_check_version("1.0"):
-            X = validate_data(self, X, dtype=[np.float64, np.float32], ensure_2d=False)
+            X = validate_data(
+                self,
+                X,
+                dtype=[np.float64, np.float32],
+                ensure_2d=False,
+                accept_sparse="csr",
+            )
         else:
             X = check_array(X, dtype=[np.float64, np.float32])
 

@@ -17,6 +17,8 @@
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
+from scipy.sparse import csr_matrix
+from sklearn.datasets import make_blobs
 
 from daal4py.sklearn._utils import daal_check_version
 from onedal.basic_statistics.tests.test_basic_statistics import (
@@ -28,6 +30,7 @@
 from onedal.tests.utils._dataframes_support import (
     _convert_to_dataframe,
     get_dataframes_and_queues,
+    get_queues,
 )
 from sklearnex.basic_statistics import BasicStatistics
 
@@ -178,6 +181,55 @@ def test_multiple_options_on_random_data(
     assert_allclose(gtr_sum, res_sum, atol=tol)
 
 
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("row_count", [100, 1000])
+@pytest.mark.parametrize("column_count", [10, 100])
+@pytest.mark.parametrize("weighted", [True, False])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+def test_multiple_options_on_random_sparse_data(
+    queue, row_count, column_count, weighted, dtype
+):
+    """Check mean, max and sum on CSR input against a dense reference.
+
+    The reference values come from the densified copy of the same matrix,
+    row-scaled by the sample weights when ``weighted`` is set.
+    """
+    seed = 77
+    random_state = 42
+    gen = np.random.default_rng(seed)
+    X, _ = make_blobs(
+        n_samples=row_count, n_features=column_count, random_state=random_state
+    )
+    density = 0.05
+    # Draw the sparsity mask from the seeded generator so runs are reproducible.
+    X_sparse = csr_matrix(X * (gen.random(X.shape) < density))
+    X_dense = X_sparse.toarray()
+
+    basicstat = BasicStatistics(result_options=["mean", "max", "sum"])
+    if weighted:
+        weights = gen.uniform(low=-0.5, high=1.0, size=row_count)
+        # NOTE(review): dtype is applied to the weights only, not to X - confirm.
+        weights = weights.astype(dtype=dtype)
+        result = basicstat.fit(X_sparse, sample_weight=weights)
+        # Row scaling by broadcast; same values as np.diag(weights) @ X_dense.
+        reference = weights[:, np.newaxis] * X_dense
+    else:
+        result = basicstat.fit(X_sparse)
+        reference = X_dense
+
+    res_mean, res_max, res_sum = result.mean, result.max, result.sum
+    gtr_mean, gtr_max, gtr_sum = (
+        expected_mean(reference),
+        expected_max(reference),
+        expected_sum(reference),
+    )
+
+    tol = 5e-4 if res_mean.dtype == np.float32 else 1e-7
+    assert_allclose(gtr_mean, res_mean, atol=tol)
+    assert_allclose(gtr_max, res_max, atol=tol)
+    assert_allclose(gtr_sum, res_sum, atol=tol)
+
+
 @pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues())
 @pytest.mark.parametrize("row_count", [100, 1000])
 @pytest.mark.parametrize("column_count", [10, 100])