diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 2a8d3df22..0d9f863bf 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -30,8 +30,7 @@ DIFFUSION_MODEL_VAE_DECODER_NAME, DIFFUSION_MODEL_VAE_ENCODER_NAME, ENCODER_NAME, - get_attention_scores_sd2, - get_attention_scores_sd15, + get_attention_scores_sd, get_attention_scores_sdxl, ) from ...utils import ( @@ -54,10 +53,7 @@ "Please update diffusers by running `pip install --upgrade diffusers`" ) from diffusers import ControlNetModel, UNet2DConditionModel - from diffusers.models.attention_processor import ( - Attention, - AttnProcessor, - ) + from diffusers.models.attention_processor import Attention if TYPE_CHECKING: @@ -388,7 +384,6 @@ def get_submodels_for_export_stable_diffusion( models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2))) # U-NET - pipeline.unet.set_attn_processor(AttnProcessor()) pipeline.unet.config.text_encoder_projection_dim = projection_dim # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 @@ -400,12 +395,9 @@ def get_submodels_for_export_stable_diffusion( if is_sdxl: logger.info("Applying optimized attention score computation for sdxl.") Attention.get_attention_scores = get_attention_scores_sdxl - elif "v1-5" in pipeline.config._name_or_path: - logger.info("Applying optimized attention score computation for stable diffusion 1.5.") - Attention.get_attention_scores = get_attention_scores_sd15 else: - logger.info("Applying optimized attention score computation for stable diffusion 2.") - Attention.get_attention_scores = get_attention_scores_sd2 + logger.info("Applying optimized attention score computation for stable diffusion.") + Attention.get_attention_scores = get_attention_scores_sd else: logger.warning( "You are not applying optimized attention score computation. If you want better performance, please" diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index e2253306d..2f305947f 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -52,8 +52,7 @@ ], "model_utils": ["get_tied_parameters_dict", "tie_parameters"], "optimization_utils": [ - "get_attention_scores_sd2", - "get_attention_scores_sd15", + "get_attention_scores_sd", "get_attention_scores_sdxl", ], "patching": [ @@ -105,8 +104,7 @@ ) from .model_utils import get_tied_parameters_dict, tie_parameters from .optimization_utils import ( - get_attention_scores_sd2, - get_attention_scores_sd15, + get_attention_scores_sd, get_attention_scores_sdxl, ) from .patching import ( diff --git a/optimum/neuron/utils/optimization_utils.py b/optimum/neuron/utils/optimization_utils.py index 5c1c5e70c..feff8e98c 100644 --- a/optimum/neuron/utils/optimization_utils.py +++ b/optimum/neuron/utils/optimization_utils.py @@ -17,43 +17,8 @@ import torch -def get_attention_scores_sd15(self, query, key, attention_mask) -> torch.Tensor: - """Optimized attention for Stable Diffusion 1.5 UNET.""" - dtype = query.dtype - - if self.upcast_attention: - query = query.float() - key = key.float() - - baddbmm_input = torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device) - beta = 0 - - attention_scores = torch.baddbmm( - baddbmm_input, - query, - key.transpose(-1, -2), - beta=beta, - alpha=self.scale, - ) - del baddbmm_input - - # TODO: following line is supposed to give the same result and reduce unnecessary overhead(no attention mask) - # however the compiled model output is far off from the one on cpu, need to further investigate. - # attention_scores = self.scale * torch.bmm(query, key.transpose(-1, -2)) # -> bad perf, max diff: 5.696073055267334 (atol: 0.001) - - if self.upcast_softmax: - attention_scores = attention_scores.float() - - attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) - del attention_scores - - attention_probs = attention_probs.to(dtype) - - return attention_probs - - -def get_attention_scores_sd2(self, query, key, attn_mask): - """Optimized attention for Stable Diffusion 2 UNET.""" +def get_attention_scores_sd(self, query, key, attn_mask): + """Optimized attention for Stable Diffusion UNET.""" dtype = query.dtype if self.upcast_attention: