From 537945fbce8f9a315ebaa19865ca895ce671ebbb Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 15:22:20 +0100
Subject: [PATCH 1/8] Changes on diffusion model encoder to avoid dimension hard-coding and crash.

---
 monai/networks/nets/diffusion_model_unet.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index 113aa505e9..e8aeff4868 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -37,10 +37,12 @@
 import torch
 from torch import nn
+import numpy as np

 from monai.networks.blocks import Convolution, CrossAttentionBlock, MLPBlock, SABlock, SpatialAttentionBlock, Upsample
 from monai.networks.layers.factories import Pool
 from monai.utils import ensure_tuple_rep, optional_import
+from functools import reduce

 Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")

@@ -1882,6 +1884,7 @@ class DiffusionModelEncoder(nn.Module):
         spatial_dims: number of spatial dimensions.
         in_channels: number of input channels.
         out_channels: number of output channels.
+        input_shape: spatial shape of the input (without batch and channel dims).
         num_res_blocks: number of residual blocks (see _ResnetBlock) per level.
         channels: tuple of block output channels.
         attention_levels: list of levels to add attention.
@@ -1901,6 +1904,7 @@ def __init__(
         spatial_dims: int,
         in_channels: int,
         out_channels: int,
+        input_shape: Sequence[int],
         num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
         channels: Sequence[int] = (32, 64, 64, 64),
         attention_levels: Sequence[bool] = (False, False, True, True),
@@ -2007,7 +2011,15 @@
             self.down_blocks.append(down_block)

-        self.out: Optional[nn.Module] = None
+        for _ in channels:
+            input_shape = [int(np.ceil(i_/2)) for i_ in input_shape]
+
+        last_dim_flattened = reduce(lambda x, y: x*y, input_shape) * self.down_blocks[-1].downsampler.op.conv.out_channels
+        self.out: Optional[nn.Module] = nn.Sequential(
+            nn.Linear(last_dim_flattened, 512),
+            nn.ReLU(), nn.Dropout(0.1),
+            nn.Linear(512, self.out_channels)
+        )

     def forward(
         self,

From d5856ecb1bd1126ac5ec5050ae79d885b1dfb2e5 Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 15:30:06 +0100
Subject: [PATCH 2/8] fix mypy issues.

---
 monai/networks/nets/diffusion_model_unet.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index e8aeff4868..f24386858b 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -33,16 +33,16 @@
 import math
 from collections.abc import Sequence
+from functools import reduce
 from typing import Optional

+import numpy as np
 import torch
 from torch import nn
-import numpy as np

 from monai.networks.blocks import Convolution, CrossAttentionBlock, MLPBlock, SABlock, SpatialAttentionBlock, Upsample
 from monai.networks.layers.factories import Pool
 from monai.utils import ensure_tuple_rep, optional_import
-from functools import reduce

 Rearrange, _ = optional_import("einops.layers.torch", name="Rearrange")
@@ -1904,7 +1904,7 @@ def __init__(
         spatial_dims: int,
         in_channels: int,
         out_channels: int,
-        input_shape: Sequence[int],
+        input_shape: Sequence[int],
         num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
         channels: Sequence[int] = (32, 64, 64, 64),
         attention_levels: Sequence[bool] = (False, False, True, True),
@@ -2012,14 +2012,13 @@
             self.down_blocks.append(down_block)

         for _ in channels:
-            input_shape = [int(np.ceil(i_/2)) for i_ in input_shape]
+            input_shape = [int(np.ceil(i_ / 2)) for i_ in input_shape]
+
+        last_dim_flattened = int(reduce(lambda x, y: x * y, input_shape) * channels[-1])

-        last_dim_flattened = reduce(lambda x, y: x*y, input_shape) * self.down_blocks[-1].downsampler.op.conv.out_channels
         self.out: Optional[nn.Module] = nn.Sequential(
-            nn.Linear(last_dim_flattened, 512),
-            nn.ReLU(), nn.Dropout(0.1),
-            nn.Linear(512, self.out_channels)
-        )
+            nn.Linear(last_dim_flattened, 512), nn.ReLU(), nn.Dropout(0.1), nn.Linear(512, self.out_channels)
+        )

     def forward(
         self,
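
Note: patches 1 and 2 above size the encoder's final linear head at construction time instead of lazily in forward(). The sketch below is not part of the patch series; it only mirrors the arithmetic added to __init__ (each entry in `channels` ceil-halves every spatial dimension, and the product is scaled by the last channel count), so the helper name is illustrative only.

    from functools import reduce

    import numpy as np

    def flattened_feature_size(input_shape, channels=(32, 64, 64, 64)):
        # Mirror the computation the patches add to __init__: ceil-halve each
        # spatial dimension once per level, then multiply by the final channel count.
        shape = list(input_shape)
        for _ in channels:
            shape = [int(np.ceil(s / 2)) for s in shape]
        return int(reduce(lambda x, y: x * y, shape) * channels[-1])

    # With the default channels, a (64, 64) input yields 4 * 4 * 64 = 1024 input
    # features for the first nn.Linear of self.out.
    print(flattened_feature_size((64, 64)))  # 1024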

From c9bb8f7f6bf699dbd78d9c847d92a49f09525f81 Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 15:36:49 +0100
Subject: [PATCH 3/8] out should never be none.

Co-authored-by: Virginia Fernandez
---
 monai/networks/nets/diffusion_model_unet.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index f24386858b..2e013d4d3b 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -2063,10 +2063,9 @@
         h = h.reshape(h.shape[0], -1)

         # 5. out
-        if self.out is None:
-            self.out = nn.Sequential(
-                nn.Linear(h.shape[1], 512), nn.ReLU(), nn.Dropout(0.1), nn.Linear(512, self.out_channels)
-            )
+        self.out = nn.Sequential(
+            nn.Linear(h.shape[1], 512), nn.ReLU(), nn.Dropout(0.1), nn.Linear(512, self.out_channels)
+        )
         output: torch.Tensor = self.out(h)

         return output

From cbf5dc58892988ba47daae8f8a7abc3e0c951ba2 Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 15:57:42 +0100
Subject: [PATCH 4/8] Fix default

---
 monai/networks/nets/diffusion_model_unet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index 2e013d4d3b..3e496979bf 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -1904,7 +1904,7 @@ def __init__(
         spatial_dims: int,
         in_channels: int,
         out_channels: int,
-        input_shape: Sequence[int],
+        input_shape: Sequence[int] = (64, 64),
         num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
         channels: Sequence[int] = (32, 64, 64, 64),
         attention_levels: Sequence[bool] = (False, False, True, True),

From 59325dc7346aeefa99750d062d11394d8e7ce8f3 Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 16:52:06 +0100
Subject: [PATCH 5/8] DCO Remediation Commit for Virginia Fernandez

I, Virginia Fernandez, hereby add my Signed-off-by to this commit: 537945fbce8f9a315ebaa19865ca895ce671ebbb
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: d5856ecb1bd1126ac5ec5050ae79d885b1dfb2e5
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: c9bb8f7f6bf699dbd78d9c847d92a49f09525f81
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: cbf5dc58892988ba47daae8f8a7abc3e0c951ba2

Signed-off-by: Virginia Fernandez
---
 monai/networks/nets/diffusion_model_unet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index 3e496979bf..1d86df845e 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -1894,7 +1894,7 @@ class DiffusionModelEncoder(nn.Module):
         num_head_channels: number of channels in each attention head.
         with_conditioning: if True add spatial transformers to perform conditioning.
         transformer_num_layers: number of layers of Transformer blocks to use.
-        cross_attention_dim: number of context dimensions to use.
+        cross_attention_dim: number of context dimensions to use.
         num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` classes.
         upcast_attention: if True, upcast attention operations to full precision.
     """

From 2a75fe30fefe03fc5f8c58365b50275b63a5aa20 Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 17:00:08 +0100
Subject: [PATCH 6/8] Remove whitespace

Signed-off-by: Virginia Fernandez
---
 monai/networks/nets/diffusion_model_unet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index 1d86df845e..3e496979bf 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -1894,7 +1894,7 @@ class DiffusionModelEncoder(nn.Module):
         num_head_channels: number of channels in each attention head.
         with_conditioning: if True add spatial transformers to perform conditioning.
         transformer_num_layers: number of layers of Transformer blocks to use.
-        cross_attention_dim: number of context dimensions to use.
+        cross_attention_dim: number of context dimensions to use.
         num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` classes.
         upcast_attention: if True, upcast attention operations to full precision.
     """

From 0cee3730db96c6e417f1e2be77b11f7ebf0f5adb Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 17:00:45 +0100
Subject: [PATCH 7/8] DCO Remediation Commit for Virginia Fernandez

I, Virginia Fernandez, hereby add my Signed-off-by to this commit: 537945fbce8f9a315ebaa19865ca895ce671ebbb
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: d5856ecb1bd1126ac5ec5050ae79d885b1dfb2e5
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: c9bb8f7f6bf699dbd78d9c847d92a49f09525f81
I, Virginia Fernandez, hereby add my Signed-off-by to this commit: cbf5dc58892988ba47daae8f8a7abc3e0c951ba2

Signed-off-by: Virginia Fernandez
---
 monai/networks/nets/diffusion_model_unet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index 3e496979bf..e5dfc03960 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -1891,7 +1891,7 @@ class DiffusionModelEncoder(nn.Module):
         norm_num_groups: number of groups for the normalization.
         norm_eps: epsilon for the normalization.
         resblock_updown: if True use residual blocks for downsampling.
-        num_head_channels: number of channels in each attention head.
+        num_head_channels: number of channels in each attention head.
         with_conditioning: if True add spatial transformers to perform conditioning.
         transformer_num_layers: number of layers of Transformer blocks to use.
         cross_attention_dim: number of context dimensions to use.

From 7ca0676c1a25d779ef3cab24fbd1f3ba285aff9d Mon Sep 17 00:00:00 2001
From: Virginia Fernandez
Date: Fri, 19 Sep 2025 17:02:08 +0100
Subject: [PATCH 8/8] Remove whitespace

Signed-off-by: Virginia Fernandez
---
 monai/networks/nets/diffusion_model_unet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/networks/nets/diffusion_model_unet.py b/monai/networks/nets/diffusion_model_unet.py
index e5dfc03960..3e496979bf 100644
--- a/monai/networks/nets/diffusion_model_unet.py
+++ b/monai/networks/nets/diffusion_model_unet.py
@@ -1891,7 +1891,7 @@ class DiffusionModelEncoder(nn.Module):
         norm_num_groups: number of groups for the normalization.
         norm_eps: epsilon for the normalization.
         resblock_updown: if True use residual blocks for downsampling.
-        num_head_channels: number of channels in each attention head.
+        num_head_channels: number of channels in each attention head.
         with_conditioning: if True add spatial transformers to perform conditioning.
         transformer_num_layers: number of layers of Transformer blocks to use.
         cross_attention_dim: number of context dimensions to use.
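
Note: taken together, the series lets DiffusionModelEncoder build its classification head from a new `input_shape` argument (default `(64, 64)` after patch 4) instead of relying on a hard-coded flattened dimension. A minimal usage sketch, assuming the class is exposed as monai.networks.nets.DiffusionModelEncoder and that its forward call takes `(x, timesteps, ...)` as in DiffusionModelUNet:

    import torch

    from monai.networks.nets import DiffusionModelEncoder

    # 2D encoder; the head is sized in __init__ from input_shape, so the
    # flattened dimension no longer needs to be hard-coded.
    encoder = DiffusionModelEncoder(
        spatial_dims=2,
        in_channels=1,
        out_channels=2,          # e.g. a two-class prediction head
        input_shape=(64, 64),    # spatial size only, without batch/channel dims
    )

    x = torch.randn(2, 1, 64, 64)            # (batch, channels, H, W)
    t = torch.randint(0, 1000, (2,)).long()  # diffusion timesteps
    logits = encoder(x, timesteps=t)         # expected shape: (2, 2)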