memory considerations finished

tvdboom · Sep 14, 2023 · 0491ca0 · 0491ca0
1 parent 620f2eb
commit 0491ca0
Show file tree

Hide file tree

Showing 329 changed files with 5,360,044 additions and 369 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/atom/atom.py b/atom/atom.py
@@ -270,7 +270,7 @@ def outliers(self) -> pd.Series | None:
         """Columns in training set with number of outlier values."""
         if not is_sparse(self.X):
             data = self.train.select_dtypes(include=["number"])
-            z_scores = (np.abs(stats.zscore(data.values.astype(float))) > 3)
+            z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
             z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns)
             return z_scores[z_scores > 0]
 
@@ -279,7 +279,8 @@ def n_outliers(self) -> Int | None:
         """Number of samples in the training set containing outliers."""
         if not is_sparse(self.X):
             data = self.train.select_dtypes(include=["number"])
-            return (np.abs(stats.zscore(data.values.astype(float))) > 3).any(axis=1).sum()
+            z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
+            return z_scores.any(axis=1).sum()
 
     @property
     def classes(self) -> pd.DataFrame | None:
@@ -870,8 +871,8 @@ def get_data(new_t: str) -> Series:
             )
 
         if self.engine.get("data") == "pyarrow":
-            self.branch.dataset = self.branch.dataset.astype(
-                {name: to_pyarrow(col) for name, col in self.branch._data.items()}
+            self.dataset = self.dataset.astype(
+                {name: to_pyarrow(col) for name, col in self.dataset.items()}
             )
 
         self._log("The column dtypes are successfully converted.", 1)

diff --git a/atom/basemodel.py b/atom/basemodel.py
@@ -1199,19 +1199,18 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
                         for step in range(len(value)):
                             mlflow.log_metric(f"evals_{key}", value[step], step=step)
 
-                # Rest of metrics are tracked when calling _get_score
+                # The rest of the metrics are tracked when calling _get_score
                 mlflow.log_metric("time_fit", self.time_fit)
 
-                if self.log_model:
-                    mlflow.sklearn.log_model(
-                        sk_model=self.estimator,
-                        artifact_path=self._est_class.__name__,
-                        signature=infer_signature(
-                            model_input=pd.DataFrame(self.X),
-                            model_output=self.predict_test.to_numpy(),
-                        ),
-                        input_example=pd.DataFrame(self.X.iloc[[0], :]),
-                    )
+                mlflow.sklearn.log_model(
+                    sk_model=self.estimator,
+                    artifact_path=self._est_class.__name__,
+                    signature=infer_signature(
+                        model_input=pd.DataFrame(self.X),
+                        model_output=self.predict_test.to_numpy(),
+                    ),
+                    input_example=pd.DataFrame(self.X.iloc[[0], :]),
+                )
 
                 if self.log_data:
                     for ds in ("train", "test"):
@@ -1223,7 +1222,7 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
                 if self.log_pipeline:
                     mlflow.sklearn.log_model(
                         sk_model=self.export_pipeline(),
-                        artifact_path=f"{self.name}_pipeline",
+                        artifact_path=f"{self._est_class.__name__}_pipeline",
                         signature=infer_signature(
                             model_input=pd.DataFrame(self.X),
                             model_output=self.predict_test.to_numpy(),

diff --git a/atom/baserunner.py b/atom/baserunner.py
@@ -731,19 +731,18 @@ def save(self, filename: str = "auto", *, save_data: Bool = True):
             method to reload the instance.
 
         """
-        if not save_data and hasattr(self, "_branches"):
+        if not save_data:
             data = {}
             og = self._branches.og._data
             self._branches._og._data = None
             for branch in self._branches:
                 data[branch.name] = dict(
                     _data=deepcopy(branch._data),
                     _holdout=deepcopy(branch._holdout),
-                    holdout=branch.__dict__.get("holdout", None)
+                    holdout=branch.__dict__.pop("holdout", None)  # Clear cached holdout
                 )
                 branch._data = None
                 branch._holdout = None
-                branch.__dict__.pop("holdout", None)  # Clear cached holdout
 
         if filename.endswith("auto"):
             filename = filename.replace("auto", self.__class__.__name__)
@@ -756,13 +755,13 @@ def save(self, filename: str = "auto", *, save_data: Bool = True):
             pickle.dump(self, f)
 
         # Restore the data to the attributes
-        if not save_data and hasattr(self, "_branches"):
+        if not save_data:
             self._branches._og._data = og
             for branch in self._branches:
                 branch._data = data[branch.name]["_data"]
                 branch._holdout = data[branch.name]["_holdout"]
                 if data[branch.name]["holdout"] is not None:
-                    branch.holdout = data[branch.name]["holdout"]
+                    branch.__dict__["holdout"] = data[branch.name]["holdout"]
 
         self._log(f"{self.__class__.__name__} successfully saved.", 1)
 

diff --git a/atom/basetracker.py b/atom/basetracker.py
@@ -19,7 +19,6 @@ class TrackingParams:
     """Tracking parameters for a mlflow experiment."""
 
     log_ht: Bool  # Track every trial of the hyperparameter tuning
-    log_model: Bool  # Save the model estimator after fitting
     log_plots: Bool  # Save plot artifacts
     log_data: Bool  # Save the train and test sets
     log_pipeline: Bool  # Save the model's pipeline
@@ -30,7 +29,6 @@ class BaseTracker:
     # Tracking parameters for mlflow
     _tracking_params = TrackingParams(
         log_ht=True,
-        log_model=True,
         log_plots=True,
         log_data=False,
         log_pipeline=False,
@@ -45,15 +43,6 @@ def log_ht(self) -> Bool:
     def log_ht(self, value: Bool):
         self._tracking_params.log_ht = value
 
-    @property
-    def log_model(self) -> Bool:
-        """Whether to save the model's estimator after fitting."""
-        return self._tracking_params.log_model
-
-    @log_model.setter
-    def log_model(self, value: Bool):
-        self._tracking_params.log_model = value
-
     @property
     def log_plots(self) -> Bool:
         """Whether to save plots as artifacts."""

diff --git a/atom/branch/branch.py b/atom/branch/branch.py
@@ -99,7 +99,7 @@ def __init__(
         if memory.location is None:
             self._location = None
         else:
-            self._location = os.path.join(memory.location, f"joblib/atom/{self}.pkl")
+            self._location = os.path.join(memory.location, "joblib", "atom", str(self))
 
     def __repr__(self) -> str:
         return f"Branch({self.name})"
@@ -109,7 +109,7 @@ def _data(self) -> DataFrame | None:
         """Get the branch's data.
 
         Load from memory if the data container is empty. This property
-        is required to access the data from inactive branches.
+        is required to access the data for inactive branches.
 
         """
         return self.load(assign=False)
@@ -304,7 +304,6 @@ def holdout(self) -> DataFrame | None:
                 *self.pipeline.transform(
                     X=self._holdout.iloc[:, :-self._data.n_cols],
                     y=self._holdout[self.target],
-                    verbose=0,
                 )
             )
 
@@ -500,7 +499,7 @@ def _get_columns(
 
         Parameters
         ----------
-        columns: int, str, range, slice, sequence or None
+        columns: int, str, range, slice, sequence or None, default=None
             Names, indices or dtypes of the columns to select. If None,
             it returns all columns in the dataframe.
 
@@ -731,10 +730,10 @@ def load(self, assign: Bool = True) -> DataContainer | None:
         """
         if self._container is None and self._location:
             try:
-                with open(self._location, "rb") as file:
+                with open(f"{self._location}.pkl", "rb") as file:
                     data = pickle.load(file)
             except FileNotFoundError:
-                raise ValueError(f"Branch {self.name} has no data.")
+                raise ValueError(f"Branch {self.name} has no data stored.")
 
             if assign:
                 self._container = data
@@ -743,7 +742,7 @@ def load(self, assign: Bool = True) -> DataContainer | None:
 
         return self._container
 
-    def store(self):
+    def store(self, assign: Bool = True):
         """Store the branch's data as a pickle in memory.
 
         After storage, the data is deleted and the branch is no longer
@@ -754,9 +753,15 @@ def store(self):
             This method is skipped silently for branches with no memory
             allocation.
 
+        Parameters
+        ----------
+        assign: bool, default=True
+            Whether to assign `None` to the data in `self`.
+
         """
-        if self._location:
-            with open(self._location, "wb") as file:
+        if self._container is not None and self._location:
+            with open(f"{self._location}.pkl", "wb") as file:
                 pickle.dump(self._container, file)
 
-            self._container = None
+            if assign:
+                self._container = None
diff --git a/atom/branch/branchmanager.py b/atom/branch/branchmanager.py
@@ -10,7 +10,7 @@
 from __future__ import annotations
 
 from copy import copy, deepcopy
-
+import shutil
 from beartype import beartype
 from joblib.memory import Memory
 
@@ -145,17 +145,25 @@ def _copy_from_parent(branch: Branch, parent: Branch):
             Parent branch from which to get the info.
 
         """
-        # Transfer data from parent or load from memory
-        if parent._data is None:
+        if branch.name == "og" and parent._location:
+            # Make a new copy of the data for the og branch
+            parent.store(assign=False)
+            shutil.copy(f"{parent._location}.pkl", f"{branch._location}.pkl")
+        elif parent._location:
+            # Store the parent's data and reload it from the stored pickle
+            # to avoid holding the dataset in memory twice at the same time
+            parent.store()
             setattr(branch, "_data", parent.load(assign=False))
         else:
+            # Copy the dataset in-memory
             setattr(branch, "_data", deepcopy(parent._data))
 
         # Deepcopy the pipeline but use the same estimators
         setattr(branch, "_pipeline", deepcopy(getattr(parent, "_pipeline")))
         for i, step in enumerate(parent._pipeline.steps):
             branch.pipeline.steps[i] = step
 
+        # Copy mapping and assign other vars
         setattr(branch, "_mapping", copy(getattr(parent, "_mapping")))
         for attr in vars(parent):
             if not hasattr(branch, attr):  # If not already assigned...
@@ -180,16 +188,16 @@ def add(self, name: str, parent: Branch | None = None):
         if name == "og":
             if not self._og:
                 self._og = Branch("og", memory=self.memory)
-                self._copy_from_parent(self._og, self.current)
+                self._copy_from_parent(self.og, self.current)
         else:
             # Skip for first call from __init__
             if self.branches:
                 self.current.store()
 
             self._current = self.branches.append(Branch(name, memory=self.memory))
 
-            if parent is not None:
-                self._copy_from_parent(self._current, parent)
+            if parent:
+                self._copy_from_parent(self.current, parent)
 
     def fill(self, data: DataContainer, holdout: DataFrame | None = None):
         """Fill the current branch with data.

diff --git a/atom/utils/utils.py b/atom/utils/utils.py
@@ -482,7 +482,7 @@ def __call__(self, study: Study, trial: FrozenTrial):
                     for i, name in enumerate(self.T._metric.keys()):
                         mlflow.log_metric(f"{name}_validation", score[i])
 
-                    if estimator and self.T.log_model:
+                    if estimator:
                         mlflow.sklearn.log_model(
                             sk_model=estimator,
                             artifact_path=estimator.__class__.__name__,
@@ -1507,9 +1507,7 @@ def check_scaling(X: Pandas, pipeline: Any | None = None) -> bool:
         has_scaler = any("scaler" in name.lower() for name in pipeline.named_steps)
 
     df = to_df(X)  # Convert to dataframe
-
-    # Remove binary columns (thus also sparse columns)
-    df = df[[c for c in df if ~np.isin(df[c].unique(), [0, 1]).all()]]
+    df = df.loc[:, (~df.isin([0, 1])).any(axis=0)]  # Remove binary columns
 
     if df.empty:  # All columns are binary -> no scaling needed
         return True

diff --git a/codecov.yml b/codecov.yml