Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
History
=======

0.1.8 (2024-08-29)
0.1.10 (2024-??-??)
------------------
* Long EM and RPCA operations wrapped with tqdm progress bars
* Readme code sample updated, and results table made consistent

0.1.9 (2024-08-29)
------------------
* Tutorials reproducibility improved with random_state parameters
* RPCA now accepts random_state parameters
Expand Down
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,26 +70,26 @@ With just these few lines of code, you can see how easy it is to
from qolmat.utils import data

# load and prepare csv data

df_data = data.get_data("Beijing")
columns = ["TEMP", "PRES", "WSPM"]
df_data = df_data[columns]
df_with_nan = data.add_holes(df_data, ratio_masked=0.2, mean_size=120)

# impute and compare
imputer_mean = imputers.ImputerSimple(strategy="mean", groups=("station",))
imputer_median = imputers.ImputerSimple(groups=("station",))
imputer_interpol = imputers.ImputerInterpolation(method="linear", groups=("station",))
imputer_var1 = imputers.ImputerEM(model="VAR", groups=("station",), method="mle", max_iter_em=50, n_iter_ou=15, dt=1e-3, p=1)
dict_imputers = {
"mean": imputer_mean,
"median": imputer_median,
"interpolation": imputer_interpol,
"VAR(1) process": imputer_var1
}
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)
comparison = comparator.Comparator(
dict_imputers,
columns,
generator_holes = generator_holes,
metrics = ["mae", "wmape", "kl_columnwise", "ks_test", "energy"],
metrics = ["mae", "wmape", "kl_columnwise", "frechet"],
)
results = comparison.compare(df_with_nan)
results.style.highlight_min(color="lightsteelblue", axis=1)
Expand Down
Binary file modified docs/images/readme_tabular_comparison.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 0 additions & 1 deletion examples/tutorials/plot_tuto_benchmark_TS.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@

comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes=generator_holes,
metrics=["mae", "wmape", "kl_columnwise", "wasserstein_columnwise"],
max_evals=10,
Expand Down
1 change: 0 additions & 1 deletion examples/tutorials/plot_tuto_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@

comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes=generator_holes,
metrics=metrics,
max_evals=2,
Expand Down
2 changes: 0 additions & 2 deletions examples/tutorials/plot_tuto_diffusion_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,6 @@

comparison = comparator.Comparator(
dict_imputers,
selected_columns=df_data.columns,
generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
metrics=["mae", "kl_columnwise"],
)
Expand Down Expand Up @@ -224,7 +223,6 @@

comparison = comparator.Comparator(
dict_imputers,
selected_columns=df_data.columns,
generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
metrics=["mae", "kl_columnwise"],
)
Expand Down
1 change: 0 additions & 1 deletion examples/tutorials/plot_tuto_mean_median.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@

comparison = comparator.Comparator(
dict_imputers,
cols_to_impute,
generator_holes=generator_holes,
metrics=metrics,
max_evals=5,
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ statsmodels = ">= 0.14.0"
typed-ast = { version = "*", optional = true }
category-encoders = "^2.6.3"
dcor = ">= 0.6"
tqdm = "*"

[tool.poetry.group.torch.dependencies]
torch = "< 2.5"
Expand Down
5 changes: 0 additions & 5 deletions qolmat/benchmark/comparator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ class Comparator:
----------
dict_models: Dict[str, Any]
dictionary of imputation methods
selected_columns: List[str]
list of column's names selected (all with at least one null value will
be imputed)
columnwise_evaluation : Optional[bool], optional
whether the metric should be calculated column-wise or not,
by default False
Expand All @@ -46,7 +43,6 @@ class Comparator:
def __init__(
self,
dict_models: Dict[str, Any],
selected_columns: List[str],
generator_holes: _HoleGenerator,
metrics: List = ["mae", "wmape", "kl_columnwise"],
dict_config_opti: Optional[Dict[str, Any]] = {},
Expand All @@ -55,7 +51,6 @@ def __init__(
verbose: bool = False,
):
self.dict_imputers = dict_models
self.selected_columns = selected_columns
self.generator_holes = generator_holes
self.metrics = metrics
self.dict_config_opti = dict_config_opti
Expand Down
13 changes: 11 additions & 2 deletions qolmat/benchmark/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,7 @@ def sum_pairwise_distances(
def frechet_distance_base(
df1: pd.DataFrame,
df2: pd.DataFrame,
df_mask: pd.DataFrame,
) -> pd.Series:
"""Compute the Fréchet distance between two dataframes df1 and df2.

Expand All @@ -853,16 +854,24 @@ def frechet_distance_base(
true dataframe
df2 : pd.DataFrame
predicted dataframe
df_mask : pd.DataFrame
Boolean mask of the elements on which the distance is computed; values outside the mask are set to NaN before the computation

Returns
-------
pd.Series
Frechet distance in a Series object

"""
if df1.shape != df2.shape:
if df1.shape != df2.shape or df1.shape != df_mask.shape:
raise Exception("inputs have to be of same dimensions.")

df1 = df1.copy()
df2 = df2.copy()
# Set to nan the values not in the mask
df1[~df_mask] = np.nan
df2[~df_mask] = np.nan

std = (np.std(df1) + np.std(df2) + EPS) / 2
mu = (np.nanmean(df1, axis=0) + np.nanmean(df2, axis=0)) / 2
df1 = (df1 - mu) / std
Expand Down Expand Up @@ -911,7 +920,7 @@ def frechet_distance(

"""
if method == "single":
return frechet_distance_base(df1, df2)
return frechet_distance_base(df1, df2, df_mask)
return pattern_based_weighted_mean_metric(
df1,
df2,
Expand Down
8 changes: 7 additions & 1 deletion qolmat/imputations/em_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from scipy import optimize as spo
from sklearn import utils as sku
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

from qolmat.utils import utils
from qolmat.utils.utils import RandomSetting
Expand Down Expand Up @@ -433,7 +434,11 @@ def fit_X(self, X: NDArray) -> None:

X = self._maximize_likelihood(X_imp, mask_na)

for iter_em in range(self.max_iter_em):
for iter_em in tqdm(
range(self.max_iter_em),
desc="EM parameters estimation",
disable=not self.verbose,
):
X = self._sample_ou(X, mask_na)

self.combine_parameters()
Expand Down Expand Up @@ -474,6 +479,7 @@ def fit(self, X: NDArray) -> "EM":
if hasattr(self, "p_to_fit") and self.p_to_fit:
aics: List[float] = []
for p in range(self.max_lagp + 1):
print("p=", p)
self.p = p
self.fit_X(X)
n1, n2 = self.X.shape
Expand Down
41 changes: 24 additions & 17 deletions qolmat/imputations/imputers_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
from numpy.typing import NDArray
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# from typing_extensions import Self
from qolmat.benchmark import metrics
Expand Down Expand Up @@ -106,23 +107,29 @@ def _fit_estimator(
optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate)
loss_fn = self.loss_fn

for epoch in range(self.epochs):
estimator.train()
optimizer.zero_grad()

input_data = torch.Tensor(X.values)
target_data = torch.Tensor(y.values)
target_data = target_data.unsqueeze(1)
outputs = estimator(input_data)
loss = loss_fn(outputs, target_data)

loss.backward()
optimizer.step()
if (epoch + 1) % 10 == 0:
logging.info(
f"Epoch [{epoch + 1}/{self.epochs}], "
f"Loss: {loss.item():.4f}"
)
# if X.shape[0] != estimator[0].in_features:
# raise ValueError(
# "The number of features in X does not match the input "
# "features of the estimator. The estimator expects"
# f" {estimator[0].in_features} features, but X has "
# f"{X.shape[0]} features."
# )

with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar:
for _ in range(self.epochs):
estimator.train()
optimizer.zero_grad()

input_data = torch.Tensor(X.values)
target_data = torch.Tensor(y.values)
target_data = target_data.unsqueeze(1)
outputs = estimator(input_data)
loss = loss_fn(outputs, target_data)

loss.backward()
optimizer.step()
pbar.set_postfix(loss=f"{loss.item():.4f}")
pbar.update(1)
return estimator

def _predict_estimator(
Expand Down
Loading
Loading