Add sigmoid/softmax interface for AsymmetricUnifiedFocalLoss

ytl0623 · ytl0623 · commit c27945aaf670 · 2025-12-22T12:04:43.000+08:00
Signed-off-by: ytl0623 <david89062388@gmail.com>
diff --git a/monai/losses/unified_focal_loss.py b/monai/losses/unified_focal_loss.py
@@ -12,19 +12,20 @@
 from __future__ import annotations
 
 import warnings
+from collections.abc import Sequence
 
 import torch
+import torch.nn.functional as F
 from torch.nn.modules.loss import _Loss
 
+from monai.losses import FocalLoss
 from monai.networks import one_hot
 from monai.utils import LossReduction
 
 
 class AsymmetricFocalTverskyLoss(_Loss):
     """
-    AsymmetricFocalTverskyLoss is a variant of FocalTverskyLoss, which attentions to the foreground class.
-
-    Actually, it's only supported for binary image segmentation now.
+    AsymmetricFocalTverskyLoss is a variant of FocalTverskyLoss, which focuses on the foreground class.
 
     Reimplementation of the Asymmetric Focal Tversky Loss described in:
 
@@ -34,6 +35,7 @@ class AsymmetricFocalTverskyLoss(_Loss):
 
     def __init__(
         self,
+        include_background: bool = True,
         to_onehot_y: bool = False,
         delta: float = 0.7,
         gamma: float = 0.75,
@@ -42,18 +44,27 @@ def __init__(
     ) -> None:
         """
         Args:
+            include_background: if False, channel index 0 (background category) is excluded from the loss calculation.
             to_onehot_y: whether to convert `y` into the one-hot format. Defaults to False.
-            delta : weight of the background. Defaults to 0.7.
-            gamma : value of the exponent gamma in the definition of the Focal loss  . Defaults to 0.75.
-            epsilon : it defines a very small number each time. simmily smooth value. Defaults to 1e-7.
+            delta: weight of the background. Defaults to 0.7.
+            gamma: value of the exponent gamma in the definition of the Focal loss. Defaults to 0.75.
+            epsilon: a small number to avoid division by zero. Defaults to 1e-7.
+            reduction: {``"none"``, ``"mean"``, ``"sum"``}
+                Specifies the reduction to apply to the output. Defaults to ``"mean"``.
         """
         super().__init__(reduction=LossReduction(reduction).value)
+        self.include_background = include_background
         self.to_onehot_y = to_onehot_y
         self.delta = delta
         self.gamma = gamma
         self.epsilon = epsilon
 
     def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            y_pred: the shape should be BNH[WD], where N is the number of classes.
+            y_true: the shape should be BNH[WD], where N is the number of classes.
+        """
         n_pred_ch = y_pred.shape[1]
 
         if self.to_onehot_y:
@@ -62,179 +73,122 @@ def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
             else:
                 y_true = one_hot(y_true, num_classes=n_pred_ch)
 
+        if not self.include_background:
+            if n_pred_ch == 1:
+                warnings.warn("single channel prediction, `include_background=False` ignored.")
+            else:
+                # if skipping background, removing first channel
+                y_true = y_true[:, 1:]
+                y_pred = y_pred[:, 1:]
+
         if y_true.shape != y_pred.shape:
             raise ValueError(f"ground truth has different shape ({y_true.shape}) from input ({y_pred.shape})")
 
         # clip the prediction to avoid NaN
         y_pred = torch.clamp(y_pred, self.epsilon, 1.0 - self.epsilon)
-        axis = list(range(2, len(y_pred.shape)))
 
         # Calculate true positives (tp), false negatives (fn) and false positives (fp)
+        # Sum over spatial dimensions (B, C, H, W, D) -> (B, C)
+        axis = list(range(2, len(y_pred.shape)))
         tp = torch.sum(y_true * y_pred, dim=axis)
         fn = torch.sum(y_true * (1 - y_pred), dim=axis)
         fp = torch.sum((1 - y_true) * y_pred, dim=axis)
-        dice_class = (tp + self.epsilon) / (tp + self.delta * fn + (1 - self.delta) * fp + self.epsilon)
 
-        # Calculate losses separately for each class, enhancing both classes
-        back_dice = 1 - dice_class[:, 0]
-        fore_dice = (1 - dice_class[:, 1]) * torch.pow(1 - dice_class[:, 1], -self.gamma)
-
-        # Average class scores
-        loss = torch.mean(torch.stack([back_dice, fore_dice], dim=-1))
-        return loss
-
-
-class AsymmetricFocalLoss(_Loss):
-    """
-    AsymmetricFocalLoss is a variant of FocalTverskyLoss, which attentions to the foreground class.
-
-    Actually, it's only supported for binary image segmentation now.
-
-    Reimplementation of the Asymmetric Focal Loss described in:
-
-    - "Unified Focal Loss: Generalising Dice and Cross Entropy-based Losses to Handle Class Imbalanced Medical Image Segmentation",
-    Michael Yeung, Computerized Medical Imaging and Graphics
-    """
+        dice_class = (tp + self.epsilon) / (tp + self.delta * fn + (1 - self.delta) * fp + self.epsilon)
 
-    def __init__(
-        self,
-        to_onehot_y: bool = False,
-        delta: float = 0.7,
-        gamma: float = 2,
-        epsilon: float = 1e-7,
-        reduction: LossReduction | str = LossReduction.MEAN,
-    ):
-        """
-        Args:
-            to_onehot_y : whether to convert `y` into the one-hot format. Defaults to False.
-            delta : weight of the background. Defaults to 0.7.
-            gamma : value of the exponent gamma in the definition of the Focal loss  . Defaults to 0.75.
-            epsilon : it defines a very small number each time. simmily smooth value. Defaults to 1e-7.
-        """
-        super().__init__(reduction=LossReduction(reduction).value)
-        self.to_onehot_y = to_onehot_y
-        self.delta = delta
-        self.gamma = gamma
-        self.epsilon = epsilon
+        # Calculate losses separately for each class
+        # Background (index 0) treated normally: 1 - dice
+        # Foreground (index > 0) treated with focal modulation: (1 - dice)^(1-gamma)
 
-    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
-        n_pred_ch = y_pred.shape[1]
+        # Note: the plain (1 - dice) term is used only for channel 0 when include_background is True.
+        # When include_background is False, index 0 is the first foreground class (the background
+        # channel was sliced off above for multi-channel input), so every channel, including
+        # index 0, receives the focal modulation.
 
-        if self.to_onehot_y:
-            if n_pred_ch == 1:
-                warnings.warn("single channel prediction, `to_onehot_y=True` ignored.")
+        loss_list = []
+        for i in range(y_pred.shape[1]):
+            # If this is the background channel (index 0 and included), use standard Dice loss
+            if i == 0 and self.include_background:
+                loss_list.append(1 - dice_class[:, i])
             else:
-                y_true = one_hot(y_true, num_classes=n_pred_ch)
-
-        if y_true.shape != y_pred.shape:
-            raise ValueError(f"ground truth has different shape ({y_true.shape}) from input ({y_pred.shape})")
+                # Foreground classes: apply focal modulation
+                # Original logic: (1 - dice) * (1 - dice)^(-gamma) -> (1-dice)^(1-gamma)
+                loss_list.append((1 - dice_class[:, i]) * torch.pow(1 - dice_class[:, i], -self.gamma))
 
-        y_pred = torch.clamp(y_pred, self.epsilon, 1.0 - self.epsilon)
-        cross_entropy = -y_true * torch.log(y_pred)
-
-        back_ce = torch.pow(1 - y_pred[:, 0], self.gamma) * cross_entropy[:, 0]
-        back_ce = (1 - self.delta) * back_ce
+        loss = torch.stack(loss_list, dim=-1)
 
-        fore_ce = cross_entropy[:, 1]
-        fore_ce = self.delta * fore_ce
+        if self.reduction == LossReduction.SUM.value:
+            return loss.sum()
+        if self.reduction == LossReduction.NONE.value:
+            return loss
+        if self.reduction == LossReduction.MEAN.value:
+            return loss.mean()
 
-        loss = torch.mean(torch.sum(torch.stack([back_ce, fore_ce], dim=1), dim=1))
-        return loss
+        raise ValueError(f'Unsupported reduction: {self.reduction}, available options are ["mean", "sum", "none"].')
 
 
 class AsymmetricUnifiedFocalLoss(_Loss):
     """
-    AsymmetricUnifiedFocalLoss is a variant of Focal Loss.
-
-    Actually, it's only supported for binary image segmentation now
-
-    Reimplementation of the Asymmetric Unified Focal Tversky Loss described in:
-
-    - "Unified Focal Loss: Generalising Dice and Cross Entropy-based Losses to Handle Class Imbalanced Medical Image Segmentation",
-    Michael Yeung, Computerized Medical Imaging and Graphics
+    AsymmetricUnifiedFocalLoss is a variant of Focal Loss that combines Focal Loss and
+    Asymmetric Focal Tversky Loss to handle class imbalance.
     """
 
     def __init__(
         self,
-        to_onehot_y: bool = False,
-        num_classes: int = 2,
         weight: float = 0.5,
+        delta: float = 0.6,
         gamma: float = 0.5,
-        delta: float = 0.7,
+        include_background: bool = True,
+        to_onehot_y: bool = False,
+        use_softmax: bool = False,
         reduction: LossReduction | str = LossReduction.MEAN,
-    ):
+    ) -> None:
         """
         Args:
-            to_onehot_y : whether to convert `y` into the one-hot format. Defaults to False.
-            num_classes : number of classes, it only supports 2 now. Defaults to 2.
-            delta : weight of the background. Defaults to 0.7.
-            gamma : value of the exponent gamma in the definition of the Focal loss. Defaults to 0.75.
-            epsilon : it defines a very small number each time. simmily smooth value. Defaults to 1e-7.
-            weight : weight for each loss function, if it's none it's 0.5. Defaults to None.
-
-        Example:
-            >>> import torch
-            >>> from monai.losses import AsymmetricUnifiedFocalLoss
-            >>> pred = torch.ones((1,1,32,32), dtype=torch.float32)
-            >>> grnd = torch.ones((1,1,32,32), dtype=torch.int64)
-            >>> fl = AsymmetricUnifiedFocalLoss(to_onehot_y=True)
-            >>> fl(pred, grnd)
+            weight: The weighting factor 'lambda' between Focal Loss and Asymmetric Focal Tversky Loss.
+            delta: weight of the background class (used in Tversky). Defaults to 0.6.
+            gamma: value of the exponent gamma in the definition of the Focal loss. Defaults to 0.5.
+            include_background: if False, channel index 0 (background category) is excluded.
+            to_onehot_y: whether to convert `y` into the one-hot format. Defaults to False.
+            use_softmax: whether to use softmax to transform the original logits into probabilities.
+                If True, softmax is used. If False, sigmoid is used. Defaults to False.
+            reduction: Specifies the reduction to apply to the output. Defaults to ``"mean"``.
         """
         super().__init__(reduction=LossReduction(reduction).value)
-        self.to_onehot_y = to_onehot_y
-        self.num_classes = num_classes
-        self.gamma = gamma
-        self.delta = delta
-        self.weight: float = weight
-        self.asy_focal_loss = AsymmetricFocalLoss(gamma=self.gamma, delta=self.delta)
-        self.asy_focal_tversky_loss = AsymmetricFocalTverskyLoss(gamma=self.gamma, delta=self.delta)
+        self.weight = weight
+        self.use_softmax = use_softmax  # stored so forward() can choose softmax vs. sigmoid
+
+        self.focal_loss = FocalLoss(
+            include_background=include_background,
+            to_onehot_y=to_onehot_y,
+            gamma=gamma,
+            reduction=reduction,
+            use_softmax=use_softmax,
+        )
+
+        self.asy_focal_tversky_loss = AsymmetricFocalTverskyLoss(
+            include_background=include_background,
+            to_onehot_y=to_onehot_y,
+            delta=delta,
+            gamma=gamma,
+            reduction=reduction,
+        )
 
-    # TODO: Implement this  function to support multiple classes segmentation
     def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
         """
         Args:
-            y_pred : the shape should be BNH[WD], where N is the number of classes.
-                It only supports binary segmentation.
-                The input should be the original logits since it will be transformed by
-                    a sigmoid in the forward function.
-            y_true : the shape should be BNH[WD], where N is the number of classes.
-                It only supports binary segmentation.
-
-        Raises:
-            ValueError: When input and target are different shape
-            ValueError: When len(y_pred.shape) != 4 and len(y_pred.shape) != 5
-            ValueError: When num_classes
-            ValueError: When the number of classes entered does not match the expected number
+            y_pred: (BNH[WD]) Logits (raw scores).
+            y_true: (BNH[WD]) Ground truth labels.
         """
-        if y_pred.shape != y_true.shape:
-            raise ValueError(f"ground truth has different shape ({y_true.shape}) from input ({y_pred.shape})")
-
-        if len(y_pred.shape) != 4 and len(y_pred.shape) != 5:
-            raise ValueError(f"input shape must be 4 or 5, but got {y_pred.shape}")
-
-        if y_pred.shape[1] == 1:
-            y_pred = one_hot(y_pred, num_classes=self.num_classes)
-            y_true = one_hot(y_true, num_classes=self.num_classes)
+        focal_loss = self.focal_loss(y_pred, y_true)
 
-        if torch.max(y_true) != self.num_classes - 1:
-            raise ValueError(f"Please make sure the number of classes is {self.num_classes-1}")
-
-        n_pred_ch = y_pred.shape[1]
-        if self.to_onehot_y:
-            if n_pred_ch == 1:
-                warnings.warn("single channel prediction, `to_onehot_y=True` ignored.")
-            else:
-                y_true = one_hot(y_true, num_classes=n_pred_ch)
+        if self.use_softmax:
+            y_pred_prob = torch.softmax(y_pred, dim=1)
+        else:
+            y_pred_prob = torch.sigmoid(y_pred)
 
-        asy_focal_loss = self.asy_focal_loss(y_pred, y_true)
-        asy_focal_tversky_loss = self.asy_focal_tversky_loss(y_pred, y_true)
+        tversky_loss = self.asy_focal_tversky_loss(y_pred_prob, y_true)
 
-        loss: torch.Tensor = self.weight * asy_focal_loss + (1 - self.weight) * asy_focal_tversky_loss
+        loss = self.weight * focal_loss + (1 - self.weight) * tversky_loss
 
-        if self.reduction == LossReduction.SUM.value:
-            return torch.sum(loss)  # sum over the batch and channel dims
-        if self.reduction == LossReduction.NONE.value:
-            return loss  # returns [N, num_classes] losses
-        if self.reduction == LossReduction.MEAN.value:
-            return torch.mean(loss)
-        raise ValueError(f'Unsupported reduction: {self.reduction}, available options are ["mean", "sum", "none"].')
+        return loss