Merge pull request #49 from tvdboom/dataengines2

Dataengines2
tvdboom · Feb 26, 2024 · 7810077 · 7810077
2 parents ff9468e + a59a3b5
commit 7810077
Show file tree

Hide file tree

Showing 46 changed files with 3,057 additions and 2,472 deletions.
diff --git a/atom/_show_versions.py b/atom/_show_versions.py
@@ -20,12 +20,11 @@
     "atom",
     "beartype",
     "category_encoders",
-    "dagshub",
     "dill",
+    "featuretools",
     "gplearn",
     "imblearn",
     "ipywidgets",
-    "featuretools",
     "joblib",
     "matplotlib",
     "mlflow",
@@ -35,14 +34,31 @@
     "optuna",
     "pandas",
     "plotly",
-    "ray",
-    "requests",
     "sklearn",
-    "sklearnex",  # Has no __version__ attribute
     "scipy",
     "shap",
     "sktime",
+    "statsmodels",
     "zoofs",  # Has no __version__ attribute
+    "botorch",
+    "catboost",
+    "dagshub",
+    "dask[distributed]",
+    "explainerdashboard",
+    "gradio",
+    "lightgbm",
+    "modin[ray]",
+    "polars",
+    "pyarrow",
+    "pyspark",
+    "ray[serve]",
+    "requests",
+    "sklearnex",
+    "schemdraw",
+    "statsforecast",
+    "sweetviz",
+    "wordcloud",
+    "xgboost",
 ]
 
 

diff --git a/atom/api.py b/atom/api.py
@@ -158,20 +158,20 @@ class ATOMClassifier(ATOM):
         **X, train, test: dataframe-like**<br>
         Feature set with shape=(n_samples, n_features).
 
-        **y: int, str or sequence**<br>
-        Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe-like**<br>
+        Target column(s) corresponding to `X`.
 
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
 
-    y: int, str, dict, sequence or dataframe, default=-1
-        Target column corresponding to `X`.
+    y: int, str, sequence or dataframe-like, default=-1
+        Target column(s) corresponding to `X`.
 
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
@@ -257,9 +257,16 @@ class ATOMClassifier(ATOM):
 
         - "data":
 
+            - "numpy"
             - "pandas" (default)
+            - "pandas-pyarrow"
+            - "polars"
+            - "polars-lazy"
             - "pyarrow"
             - "modin"
+            - "dask"
+            - "pyspark"
+            - "pyspark-pandas"
 
         - "estimator":
 
@@ -276,6 +283,7 @@ class ATOMClassifier(ATOM):
           parallelism. Less robust than `loky`.
         - "threading": Single-node, thread-based parallelism.
         - "ray": Multi-node, process-based parallelism.
+        - "dask": Multi-node, process-based parallelism.
 
     memory: bool, str, Path or Memory, default=False
         Enables caching for memory optimization. Read more in the
@@ -428,24 +436,24 @@ class ATOMForecaster(ATOM):
         Exogenous feature set corresponding to y, with shape=(n_samples,
         n_features).
 
-        **y: int, str or sequence**<br>
+        **y: int, str, sequence or dataframe-like**<br>
         Time series.
 
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
 
-    y: int, str, dict, sequence or dataframe, default=-1
+    y: int, str, sequence or dataframe-like, default=-1
         Time series.
 
-        - If None: y is ignored.
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If None: `y` is ignored.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
 
         This parameter is ignored if the time series is provided
         through `arrays`.
@@ -526,9 +534,16 @@ class ATOMForecaster(ATOM):
 
         - "data":
 
+            - "numpy"
             - "pandas" (default)
+            - "pandas-pyarrow"
+            - "polars"
+            - "polars-lazy"
             - "pyarrow"
             - "modin"
+            - "dask"
+            - "pyspark"
+            - "pyspark-pandas"
 
         - "estimator":
 
@@ -545,6 +560,7 @@ class ATOMForecaster(ATOM):
           parallelism. Less robust than `loky`.
         - "threading": Single-node, thread-based parallelism.
         - "ray": Multi-node, process-based parallelism.
+        - "dask": Multi-node, process-based parallelism.
 
     memory: bool, str, Path or Memory, default=False
         Enables caching for memory optimization. Read more in the
@@ -689,24 +705,24 @@ class ATOMRegressor(ATOM):
         **X, train, test: dataframe-like**<br>
         Feature set with shape=(n_samples, n_features).
 
-        **y: int, str or sequence**<br>
-        Target column corresponding to `X`.
+        **y: int, str, sequence or dataframe-like**<br>
+        Target column(s) corresponding to `X`.
 
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
 
-    y: int, str, dict, sequence or dataframe, default=-1
-        Target column corresponding to `X`.
+    y: int, str, sequence or dataframe-like, default=-1
+        Target column(s) corresponding to `X`.
 
-        - If None: y is ignored.
-        - If int: Position of the target column in X.
-        - If str: Name of the target column in X.
+        - If None: `y` is ignored.
+        - If int: Position of the target column in `X`.
+        - If str: Name of the target column in `X`.
         - If sequence: Target column with shape=(n_samples,) or
           sequence of column names or positions for multioutput tasks.
-        - If dataframe: Target columns for multioutput tasks.
+        - If dataframe-like: Target columns for multioutput tasks.
 
         This parameter is ignored if the target column is provided
         through `arrays`.
@@ -775,9 +791,16 @@ class ATOMRegressor(ATOM):
 
         - "data":
 
+            - "numpy"
             - "pandas" (default)
+            - "pandas-pyarrow"
+            - "polars"
+            - "polars-lazy"
             - "pyarrow"
             - "modin"
+            - "dask"
+            - "pyspark"
+            - "pyspark-pandas"
 
         - "estimator":
 
@@ -794,6 +817,7 @@ class ATOMRegressor(ATOM):
           parallelism. Less robust than `loky`.
         - "threading": Single-node, thread-based parallelism.
         - "ray": Multi-node, process-based parallelism.
+        - "dask": Multi-node, process-based parallelism.
 
     memory: bool, str, Path or Memory, default=False
         Enables caching for memory optimization. Read more in the