Skip to content

Commit

Permalink
memory considerations finished
Browse files Browse the repository at this point in the history
  • Loading branch information
tvdboom committed Sep 14, 2023
1 parent 620f2eb commit 0491ca0
Show file tree
Hide file tree
Showing 329 changed files with 5,360,044 additions and 369 deletions.
21 changes: 0 additions & 21 deletions LICENSE

This file was deleted.

9 changes: 5 additions & 4 deletions atom/atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def outliers(self) -> pd.Series | None:
"""Columns in training set with number of outlier values."""
if not is_sparse(self.X):
data = self.train.select_dtypes(include=["number"])
z_scores = (np.abs(stats.zscore(data.values.astype(float))) > 3)
z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
z_scores = pd.Series(z_scores.sum(axis=0), index=data.columns)
return z_scores[z_scores > 0]

Expand All @@ -279,7 +279,8 @@ def n_outliers(self) -> Int | None:
"""Number of samples in the training set containing outliers."""
if not is_sparse(self.X):
data = self.train.select_dtypes(include=["number"])
return (np.abs(stats.zscore(data.values.astype(float))) > 3).any(axis=1).sum()
z_scores = (np.abs(stats.zscore(data.to_numpy(float, na_value=np.nan))) > 3)
return z_scores.any(axis=1).sum()

@property
def classes(self) -> pd.DataFrame | None:
Expand Down Expand Up @@ -870,8 +871,8 @@ def get_data(new_t: str) -> Series:
)

if self.engine.get("data") == "pyarrow":
self.branch.dataset = self.branch.dataset.astype(
{name: to_pyarrow(col) for name, col in self.branch._data.items()}
self.dataset = self.dataset.astype(

Check notice on line 874 in atom/atom.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute dataset defined outside __init__
{name: to_pyarrow(col) for name, col in self.dataset.items()}
)

self._log("The column dtypes are successfully converted.", 1)
Expand Down
23 changes: 11 additions & 12 deletions atom/basemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1199,19 +1199,18 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
for step in range(len(value)):
mlflow.log_metric(f"evals_{key}", value[step], step=step)

# Rest of metrics are tracked when calling _get_score
# The rest of the metrics are tracked when calling _get_score
mlflow.log_metric("time_fit", self.time_fit)

if self.log_model:
mlflow.sklearn.log_model(
sk_model=self.estimator,
artifact_path=self._est_class.__name__,
signature=infer_signature(
model_input=pd.DataFrame(self.X),
model_output=self.predict_test.to_numpy(),
),
input_example=pd.DataFrame(self.X.iloc[[0], :]),
)
mlflow.sklearn.log_model(
sk_model=self.estimator,
artifact_path=self._est_class.__name__,
signature=infer_signature(
model_input=pd.DataFrame(self.X),
model_output=self.predict_test.to_numpy(),
),
input_example=pd.DataFrame(self.X.iloc[[0], :]),
)

if self.log_data:
for ds in ("train", "test"):
Expand All @@ -1223,7 +1222,7 @@ def fit(self, X: DataFrame | None = None, y: Pandas | None = None):
if self.log_pipeline:
mlflow.sklearn.log_model(
sk_model=self.export_pipeline(),
artifact_path=f"{self.name}_pipeline",
artifact_path=f"{self._est_class.__name__}_pipeline",
signature=infer_signature(
model_input=pd.DataFrame(self.X),
model_output=self.predict_test.to_numpy(),
Expand Down
9 changes: 4 additions & 5 deletions atom/baserunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,19 +731,18 @@ def save(self, filename: str = "auto", *, save_data: Bool = True):
method to reload the instance.
"""
if not save_data and hasattr(self, "_branches"):
if not save_data:
data = {}
og = self._branches.og._data

Check notice on line 736 in atom/baserunner.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _data of a class
self._branches._og._data = None

Check notice on line 737 in atom/baserunner.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _og of a class
for branch in self._branches:
data[branch.name] = dict(
_data=deepcopy(branch._data),
_holdout=deepcopy(branch._holdout),
holdout=branch.__dict__.get("holdout", None)
holdout=branch.__dict__.pop("holdout", None) # Clear cached holdout
)
branch._data = None
branch._holdout = None
branch.__dict__.pop("holdout", None) # Clear cached holdout

if filename.endswith("auto"):
filename = filename.replace("auto", self.__class__.__name__)
Expand All @@ -756,13 +755,13 @@ def save(self, filename: str = "auto", *, save_data: Bool = True):
pickle.dump(self, f)

# Restore the data to the attributes
if not save_data and hasattr(self, "_branches"):
if not save_data:
self._branches._og._data = og

Check notice on line 759 in atom/baserunner.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _og of a class

Check warning on line 759 in atom/baserunner.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Unbound local variables

Local variable 'og' might be referenced before assignment
for branch in self._branches:
branch._data = data[branch.name]["_data"]

Check warning on line 761 in atom/baserunner.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Unbound local variables

Local variable 'data' might be referenced before assignment
branch._holdout = data[branch.name]["_holdout"]
if data[branch.name]["holdout"] is not None:
branch.holdout = data[branch.name]["holdout"]
branch.__dict__["holdout"] = data[branch.name]["holdout"]

self._log(f"{self.__class__.__name__} successfully saved.", 1)

Expand Down
11 changes: 0 additions & 11 deletions atom/basetracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ class TrackingParams:
"""Tracking parameters for a mlflow experiment."""

log_ht: Bool # Track every trial of the hyperparameter tuning
log_model: Bool # Save the model estimator after fitting
log_plots: Bool # Save plot artifacts
log_data: Bool # Save the train and test sets
log_pipeline: Bool # Save the model's pipeline
Expand All @@ -30,7 +29,6 @@ class BaseTracker:
# Tracking parameters for mlflow
_tracking_params = TrackingParams(
log_ht=True,
log_model=True,
log_plots=True,
log_data=False,
log_pipeline=False,
Expand All @@ -45,15 +43,6 @@ def log_ht(self) -> Bool:
def log_ht(self, value: Bool):
self._tracking_params.log_ht = value

@property
def log_model(self) -> Bool:
"""Whether to save the model's estimator after fitting."""
return self._tracking_params.log_model

@log_model.setter
def log_model(self, value: Bool):
self._tracking_params.log_model = value

@property
def log_plots(self) -> Bool:
"""Whether to save plots as artifacts."""
Expand Down
25 changes: 15 additions & 10 deletions atom/branch/branch.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def __init__(
if memory.location is None:
self._location = None
else:
self._location = os.path.join(memory.location, f"joblib/atom/{self}.pkl")
self._location = os.path.join(memory.location, "joblib", "atom", str(self))

def __repr__(self) -> str:
return f"Branch({self.name})"
Expand All @@ -109,7 +109,7 @@ def _data(self) -> DataFrame | None:
"""Get the branch's data.
Load from memory if the data container is empty. This property
is required to access the data from inactive branches.
is required to access the data for inactive branches.
"""
return self.load(assign=False)
Expand Down Expand Up @@ -304,7 +304,6 @@ def holdout(self) -> DataFrame | None:
*self.pipeline.transform(
X=self._holdout.iloc[:, :-self._data.n_cols],
y=self._holdout[self.target],
verbose=0,
)
)

Expand Down Expand Up @@ -500,7 +499,7 @@ def _get_columns(
Parameters
----------
columns: int, str, range, slice, sequence or None
columns: int, str, range, slice, sequence or None, default=None
Names, indices or dtypes of the columns to select. If None,
it returns all columns in the dataframe.
Expand Down Expand Up @@ -731,10 +730,10 @@ def load(self, assign: Bool = True) -> DataContainer | None:
"""
if self._container is None and self._location:
try:
with open(self._location, "rb") as file:
with open(f"{self._location}.pkl", "rb") as file:
data = pickle.load(file)
except FileNotFoundError:
raise ValueError(f"Branch {self.name} has no data.")
raise ValueError(f"Branch {self.name} has no data stored.")

if assign:
self._container = data
Expand All @@ -743,7 +742,7 @@ def load(self, assign: Bool = True) -> DataContainer | None:

return self._container

def store(self):
def store(self, assign: Bool = True):
"""Store the branch's data as a pickle in memory.
After storage, the data is deleted and the branch is no longer
Expand All @@ -754,9 +753,15 @@ def store(self):
This method is skipped silently for branches with no memory
allocation.
Parameters
----------
assign: bool, default=True
Whether to assign `None` to the data in `self`.
"""
if self._location:
with open(self._location, "wb") as file:
if self._container is not None and self._location:
with open(f"{self._location}.pkl", "wb") as file:
pickle.dump(self._container, file)

self._container = None
if assign:
self._container = None
20 changes: 14 additions & 6 deletions atom/branch/branchmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from __future__ import annotations

from copy import copy, deepcopy

import shutil
from beartype import beartype
from joblib.memory import Memory

Expand Down Expand Up @@ -145,17 +145,25 @@ def _copy_from_parent(branch: Branch, parent: Branch):
Parent branch from which to get the info.
"""
# Transfer data from parent or load from memory
if parent._data is None:
if branch.name == "og" and parent._location:

Check notice on line 148 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _location of a class
# Make a new copy of the data for the og branch
parent.store(assign=False)
shutil.copy(f"{parent._location}.pkl", f"{branch._location}.pkl")

Check notice on line 151 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _location of a class

Check notice on line 151 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _location of a class
elif parent._location:

Check notice on line 152 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _location of a class
# Transfer data from memory to avoid having
# the datasets in memory twice at one time
parent.store()
setattr(branch, "_data", parent.load(assign=False))
else:
# Copy the dataset in-memory
setattr(branch, "_data", deepcopy(parent._data))

Check notice on line 159 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _data of a class

# Deepcopy the pipeline but use the same estimators
setattr(branch, "_pipeline", deepcopy(getattr(parent, "_pipeline")))
for i, step in enumerate(parent._pipeline.steps):

Check notice on line 163 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _pipeline of a class
branch.pipeline.steps[i] = step

# Copy mapping and assign other vars
setattr(branch, "_mapping", copy(getattr(parent, "_mapping")))
for attr in vars(parent):
if not hasattr(branch, attr): # If not already assigned...
Expand All @@ -180,16 +188,16 @@ def add(self, name: str, parent: Branch | None = None):
if name == "og":
if not self._og:
self._og = Branch("og", memory=self.memory)
self._copy_from_parent(self._og, self.current)
self._copy_from_parent(self.og, self.current)
else:
# Skip for first call from __init__
if self.branches:
self.current.store()

self._current = self.branches.append(Branch(name, memory=self.memory))

Check notice on line 197 in atom/branch/branchmanager.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

An instance attribute is defined outside `__init__`

Instance attribute _current defined outside __init__

if parent is not None:
self._copy_from_parent(self._current, parent)
if parent:
self._copy_from_parent(self.current, parent)

def fill(self, data: DataContainer, holdout: DataFrame | None = None):
"""Fill the current branch with data.
Expand Down
6 changes: 2 additions & 4 deletions atom/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def __call__(self, study: Study, trial: FrozenTrial):
for i, name in enumerate(self.T._metric.keys()):

Check notice on line 482 in atom/utils/utils.py

View workflow job for this annotation

GitHub Actions / Qodana Community for Python

Accessing a protected member of a class or a module

Access to a protected member _metric of a class
mlflow.log_metric(f"{name}_validation", score[i])

if estimator and self.T.log_model:
if estimator:
mlflow.sklearn.log_model(
sk_model=estimator,
artifact_path=estimator.__class__.__name__,
Expand Down Expand Up @@ -1507,9 +1507,7 @@ def check_scaling(X: Pandas, pipeline: Any | None = None) -> bool:
has_scaler = any("scaler" in name.lower() for name in pipeline.named_steps)

df = to_df(X) # Convert to dataframe

# Remove binary columns (thus also sparse columns)
df = df[[c for c in df if ~np.isin(df[c].unique(), [0, 1]).all()]]
df = df.loc[:, (~df.isin([0, 1])).any(axis=0)] # Remove binary columns

if df.empty: # All columns are binary -> no scaling needed
return True
Expand Down
5 changes: 0 additions & 5 deletions codecov.yml

This file was deleted.

Loading

0 comments on commit 0491ca0

Please sign in to comment.