Fix Loss Function to Improve Model Convergence for AutoEncoder #1460

Merged: 11 commits, Jan 22, 2024
morpheus/models/dfencoder/autoencoder.py (74 changes: 42 additions & 32 deletions)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -626,8 +626,10 @@ def preprocess_data(
         return preprocessed_data
 
     def compute_loss(self, num, bin, cat, target_df, should_log=True, _id=False):
+
         num_target, bin_target, codes = self.compute_targets(target_df)
-        return self.compute_loss_from_targets(
+
+        mse, bce, cce, net = self.compute_loss_from_targets(
             num=num,
             bin=bin,
             cat=cat,
@@ -638,6 +640,10 @@ def compute_loss(self, num, bin, cat, target_df, should_log=True, _id=False):
             _id=_id,
         )
 
+        net = net.cpu().item()
+
+        return mse, bce, cce, net
+
     def compute_loss_from_targets(self, num, bin, cat, num_target, bin_target, cat_target, should_log=True, _id=False):
         """Computes the loss from targets.
 
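The two hunks above change `compute_loss` from returning the tensor produced by `compute_loss_from_targets` to unpacking it and converting the net loss to a Python float. A minimal sketch of why that conversion matters, using only stock PyTorch (the variable names are illustrative, not from the diff): `.item()` yields a plain float detached from the autograd graph, safe to log or compare, while `backward()` must run on the still-attached tensor, which is why the training path keeps the tensor form.

import torch

# .item() returns a plain Python float, detached from autograd; backward()
# must run while the loss is still a graph-connected tensor.
pred = torch.randn(4, requires_grad=True)
loss = (pred ** 2).mean()

loss.backward()               # fine: loss is still attached to the graph
as_float = loss.cpu().item()  # plain float; safe to log, compare, or return
print(type(as_float))         # <class 'float'>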
@@ -670,38 +676,41 @@ def compute_loss_from_targets(self, num, bin, cat, num_target, bin_target, cat_t
                 should_log = True
             else:
                 should_log = False
-        net_loss = []
-        mse_loss = self.mse(num, num_target)
-        net_loss += list(mse_loss.mean(dim=0).cpu().detach().numpy())
-        mse_loss = mse_loss.mean()
-        bce_loss = self.bce(bin, bin_target)
-        net_loss += list(bce_loss.mean(dim=0).cpu().detach().numpy())
-        bce_loss = bce_loss.mean()
-        cce_loss = []
-        for i, ft in enumerate(self.categorical_fts):
-            loss = self.cce(cat[i], cat_target[i])
-            loss = loss.mean()
-            cce_loss.append(loss)
-            val = loss.cpu().item()
-            net_loss += [val]
+
+        # Calculate the numerical loss (per feature)
+        mse_loss: torch.Tensor = self.mse(num, num_target).mean(dim=0)
+
+        # Calculate the binary loss (per feature)
+        bce_loss: torch.Tensor = self.bce(bin, bin_target).mean(dim=0)
+
+        # To calc the categorical loss, we need to average the loss of each categorical feature independently (since
+        # they will have a different number of categories)
+        cce_loss_list = []
+
+        for i in range(len(self.categorical_fts)):
+            # Take the full mean but ensure the output is a 1x1 tensor to make it easier to concatenate
+            cce_loss_list.append(self.cce(cat[i], cat_target[i]).mean(dim=0, keepdim=True))
+
+        if (len(cce_loss_list) > 0):
+            cce_loss = torch.cat(cce_loss_list)
+        else:
+            cce_loss = torch.Tensor().to(self.device)
+
+        # The net loss should have one loss per feature
+        net_loss = torch.cat((mse_loss, bce_loss, cce_loss))
 
         if should_log:
+            # Convert it to a list of numpy
+            net_loss_list = net_loss.tolist()
+
             if self.training:
-                self.logger.training_step(net_loss)
+                self.logger.training_step(net_loss_list)
             elif _id:
-                self.logger.id_val_step(net_loss)
+                self.logger.id_val_step(net_loss_list)
             elif not self.training:
-                self.logger.val_step(net_loss)
-
-        net_loss = np.array(net_loss).mean()
-        return mse_loss, bce_loss, cce_loss, net_loss
+                self.logger.val_step(net_loss_list)
 
-    def do_backward(self, mse, bce, cce):
-        # running `backward()` seperately on mse/bce/cce is equivalent to summing them up and run `backward()` once
-        loss_fn = mse + bce
-        for ls in cce:
-            loss_fn += ls
-        loss_fn.backward()
+        return mse_loss.mean(), bce_loss.mean(), cce_loss.mean(), net_loss.mean()
 
     def compute_baseline_performance(self, in_, out_):
         """
@@ -729,6 +738,7 @@ def compute_baseline_performance(self, in_, out_):
             codes_pred.append(pred)
         mse_loss, bce_loss, cce_loss, net_loss = self.compute_loss(num_pred, bin_pred, codes_pred, out_,
                                                                    should_log=False)
+
         if isinstance(self.logger, BasicLogger):
             self.logger.baseline_loss = net_loss
         return net_loss
@@ -981,11 +991,11 @@ def _fit_batch(self, input_swapped, num_target, bin_target, cat_target, **kwargs
             cat_target=cat_target,
             should_log=True,
         )
-        self.do_backward(mse, bce, cce)
+        net_loss.backward()
         self.optim.step()
         self.optim.zero_grad()
 
-        return net_loss
+        return net_loss.cpu().item()
 
     def _compute_baseline_performance_from_dataset(self, validation_dataset):
         self.eval()
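`_fit_batch` now backpropagates the `net_loss` scalar directly, making the removed `do_backward` helper unnecessary. The deleted comment's claim, that running `backward()` separately on mse/bce/cce accumulates the same gradients as one `backward()` on their sum, is correct because PyTorch adds gradients into `.grad`; the new code simply backpropagates a mean instead, rescaling each gradient by one over the number of features. A small sketch verifying the accumulation equivalence (toy tensors, not Morpheus code):

import torch

x = torch.tensor([1.0, 2.0], requires_grad=True)
a = (x ** 2).sum()
b = (3.0 * x).sum()

(a + b).backward(retain_graph=True)   # one backward on the sum
grad_summed = x.grad.clone()

x.grad = None                         # reset, then accumulate separately
a.backward(retain_graph=True)
b.backward()
assert torch.allclose(grad_summed, x.grad)  # [5., 7.] either way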
@@ -1028,7 +1038,7 @@ def _compute_batch_baseline_performance(
             cat_target=cat_target,
             should_log=False
         )
-        return net_loss
+        return net_loss.cpu().item()
 
     def _validate_dataset(self, validation_dataset, rank=None):
         """Runs a validation loop on the given validation dataset, computing and returning the average loss of both the original
@@ -1108,7 +1118,7 @@ def _validate_batch(self, input_original, input_swapped, num_target, bin_target,
             cat_target=cat_target,
             should_log=True,
         )
-        return orig_net_loss, net_loss
+        return orig_net_loss.cpu().item(), net_loss.cpu().item()
 
     def _populate_loss_stats_from_dataset(self, dataset):
         """Populates the `self.feature_loss_stats` dict with feature losses computed using the provided dataset.
tests/dfencoder/test_autoencoder.py (19 changes: 18 additions & 1 deletion)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,6 +18,7 @@
 import typing
 from unittest.mock import patch
 
+import numpy as np
 import pandas as pd
 import pytest
 import torch
@@ -483,3 +484,19 @@ def test_auto_encoder_get_results(train_ae: autoencoder.AutoEncoder, train_df: p
     # Numpy float has different precision checks than python float, so we wrap it.
     assert round(float(results.loc[0, 'mean_abs_z']), 3) == 0.335
     assert results.loc[0, 'z_loss_scaler_type'] == 'z'
+
+
+@pytest.mark.usefixtures("manual_seed")
+def test_auto_encoder_num_only_convergence(train_ae: autoencoder.AutoEncoder):
+    num_df = pd.DataFrame({
+        'num_feat_1': [5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9],
+        'num_feat_2': [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1],
+    })
+
+    train_ae.fit(num_df, epochs=50)
+
+    avg_loss = np.sum([np.array(loss[1])
+                       for loss in train_ae.logger.train_fts.values()], axis=0) / len(train_ae.logger.train_fts)
+
+    # Make sure the model converges with numerical feats only
+    assert avg_loss[-1] < avg_loss[0] / 2
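The new test averages the per-feature training-loss curves recorded by the logger and requires the final epoch's average to fall below half of the first epoch's. A toy version of the same arithmetic with fabricated curves (the `train_fts` layout, a per-epoch loss series at index 1 of each value, is inferred from how the test indexes it):

import numpy as np

train_fts = {
    'num_feat_1': (None, [1.00, 0.62, 0.41, 0.28, 0.21]),
    'num_feat_2': (None, [0.80, 0.55, 0.37, 0.26, 0.19]),
}

avg_loss = np.sum([np.array(v[1]) for v in train_fts.values()],
                  axis=0) / len(train_fts)
print(avg_loss)                        # [0.9   0.585 0.39  0.27  0.2  ]

assert avg_loss[-1] < avg_loss[0] / 2  # converged: 0.2 < 0.45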
tests/dfencoder/test_dfencoder_distributed_e2e.py (5 changes: 3 additions & 2 deletions)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -42,6 +42,7 @@
"log_count",
"location_incr",
"app_incr",
"has_error",
]
LOSS_TYPES = ["train", "val", "id_val"]
# 75th quantile of the losses from 100 times of offline training
@@ -146,7 +147,7 @@ def _run_test(rank, world_size):
         min_cats=1,
         device=rank,
         preset_numerical_scaler_params=preset_numerical_scaler_params,
-        binary_feature_list=[],
+        binary_feature_list=['has_error'],
         preset_cats=preset_cats,
         eval_batch_size=1024,
         patience=5,