
ip-adapter plus is not working with full face #2400

Open · circuluspibo opened this issue Sep 19, 2024 · 6 comments
Labels: PSE Escalate to PSE for further investigate, support_request

@circuluspibo

Hi. I am trying to use IP-Adapter with OpenVINO.
Two models work well (ip-adapter_sd15.bin, ip-adapter_sd15_light.bin),
but "ip-adapter-full-face_sd15.bin" does not: with some modification of the example I can convert that model, but inference fails.

How can I use that model with SD 1.5?

@eaidova
Collaborator

eaidova commented Sep 19, 2024

@circuluspibo could you please provide more details about the error and which modifications you performed?

@circuluspibo
Author

circuluspibo commented Sep 19, 2024

from pathlib import Path
from diffusers import AutoPipelineForText2Image
from transformers import CLIPVisionModelWithProjection
from diffusers.utils import load_image
from diffusers import LCMScheduler


stable_diffusion_id = "circulus/canvers-real-v3.9.1"
ip_adapter_id = "h94/IP-Adapter"
ip_adapter_weight_name = "ip-adapter-full-face_sd15.bin"  # changed from "ip-adapter_sd15.bin" to the full-face adapter
lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
models_dir = Path("on-canvers-real-v3.9.1-ov-ip")
int8_model_path = Path("on-canvers-real-v3.9.1-ov-ip-int8")
from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import OVQuantizationMethod

load_original_pipeline = not all(
    [
        (models_dir / model_name).exists()
        for model_name in [
            "text_encoder.xml",
            "image_encoder.xml",
            "unet.xml",
            "vae_decoder.xml",
            "vae_encoder.xml",
        ]
    ]
)


def get_pipeline_components(
    stable_diffusion_id,
    ip_adapter_id,
    ip_adapter_weight_name,
    lcm_lora_id,
    ip_adapter_scale=0.65,
):
    image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder")
    print(image_encoder)
    pipeline = AutoPipelineForText2Image.from_pretrained(stable_diffusion_id, image_encoder=image_encoder)
    pipeline.load_lora_weights(lcm_lora_id)
    pipeline.fuse_lora()
    pipeline.load_ip_adapter(ip_adapter_id, subfolder="models", weight_name=ip_adapter_weight_name)
    pipeline.set_ip_adapter_scale(ip_adapter_scale)
    scheduler = LCMScheduler.from_pretrained(stable_diffusion_id, subfolder="scheduler")
    return (
        pipeline.tokenizer,
        pipeline.feature_extractor,
        scheduler,
        pipeline.text_encoder,
        pipeline.image_encoder,
        pipeline.unet,
        pipeline.vae,
    )


if load_original_pipeline:
    (
        tokenizer,
        feature_extractor,
        scheduler,
        text_encoder,
        image_encoder,
        unet,
        vae,
    ) = get_pipeline_components(stable_diffusion_id, ip_adapter_id, ip_adapter_weight_name, lcm_lora_id)
    scheduler.save_pretrained(models_dir / "scheduler")
else:
    tokenizer, feature_extractor, scheduler, text_encoder, image_encoder, unet, vae = (
        None,
        None,
        None,
        None,
        None,
        None,
        None,
    )

import openvino as ov
import torch
import gc


def cleanup_torchscript_cache():
    """
    Helper for removing cached model representation
    """
    torch._C._jit_clear_class_registry()
    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
    torch.jit._state._clear_class_state()

IMAGE_ENCODER_PATH = models_dir / "image_encoder.xml"
UNET_PATH = models_dir / "unet.xml"
VAE_DECODER_PATH = models_dir / "vae_decoder.xml"
VAE_ENCODER_PATH = models_dir / "vae_encoder.xml"
TEXT_ENCODER_PATH = models_dir / "text_encoder.xml"

if not IMAGE_ENCODER_PATH.exists():
    with torch.no_grad():
        ov_model = ov.convert_model(
            image_encoder,
            example_input=torch.zeros((1, 3, 224, 224)),
            input=[-1, 3, 224, 224],
        )
    ov.save_model(ov_model, IMAGE_ENCODER_PATH)
    feature_extractor.save_pretrained(models_dir / "feature_extractor")
    del ov_model
    cleanup_torchscript_cache()
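# NOTE: this exports the encoder's pooled, projected image_embeds output
# (1024-dim for this ViT-H/14 encoder). The plus/full-face adapters instead
# consume the penultimate hidden states, shape (batch, 257, 1280), so those
# variants would likely need a wrapper along these lines (illustrative sketch,
# not part of the original repro):
#
# class ImageEncoderHiddenStatesWrapper(torch.nn.Module):
#     def __init__(self, image_encoder):
#         super().__init__()
#         self.image_encoder = image_encoder
#
#     def forward(self, pixel_values):
#         outputs = self.image_encoder(pixel_values, output_hidden_states=True)
#         return outputs.hidden_states[-2]  # penultimate layer, (batch, 257, 1280)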


if not UNET_PATH.exists():
    inputs = {
        "sample": torch.randn((2, 4, 64, 64)),
        "timestep": torch.tensor(1),
        "encoder_hidden_states": torch.randn((2, 77, 768)),
        "added_cond_kwargs": {"image_embeds": torch.ones(( 2,1280))}, #torch.ones((2, 1024) -  original emveds change for avoid failing convert
    }
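    # NOTE: for the plus/full-face adapters, diffusers feeds the encoder's
    # penultimate hidden states, shape (batch, 257, 1280), rather than a pooled
    # (batch, 1280) vector, so (2, 257, 1280) is likely the example shape this
    # trace needs for the full-face checkpoint (assumption, not verified here).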

    print(unet)

    with torch.no_grad():
        ov_model = ov.convert_model(unet, example_input=inputs)
    # dictionary with added_cond_kwargs will be decomposed during conversion
    # in some cases decomposition may lead to losing data type and shape information
    # We need to recover it manually after the conversion
    ov_model.inputs[-1].get_node().set_element_type(ov.Type.f32)
    ov_model.validate_nodes_and_infer_types()
    ov.save_model(ov_model, UNET_PATH)
    del ov_model
    cleanup_torchscript_cache()

if not VAE_DECODER_PATH.exists():

    class VAEDecoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, latents):
            return self.vae.decode(latents)

    vae_decoder = VAEDecoderWrapper(vae)
    with torch.no_grad():
        ov_model = ov.convert_model(vae_decoder, example_input=torch.ones([1, 4, 64, 64]))
    ov.save_model(ov_model, VAE_DECODER_PATH)
    del ov_model
    cleanup_torchscript_cache()
    del vae_decoder

if not VAE_ENCODER_PATH.exists():

    class VAEEncoderWrapper(torch.nn.Module):
        def __init__(self, vae):
            super().__init__()
            self.vae = vae

        def forward(self, image):
            return self.vae.encode(x=image)["latent_dist"].sample()

    vae_encoder = VAEEncoderWrapper(vae)
    vae_encoder.eval()
    image = torch.zeros((1, 3, 512, 512))
    with torch.no_grad():
        ov_model = ov.convert_model(vae_encoder, example_input=image)
    ov.save_model(ov_model, VAE_ENCODER_PATH)
    del ov_model
    cleanup_torchscript_cache()


if not TEXT_ENCODER_PATH.exists():
    with torch.no_grad():
        ov_model = ov.convert_model(
            text_encoder,
            example_input=torch.ones([1, 77], dtype=torch.long),
            input=[
                (1, 77),
            ],
        )
    ov.save_model(ov_model, TEXT_ENCODER_PATH)
    del ov_model
    cleanup_torchscript_cache()
    tokenizer.save_pretrained(models_dir / "tokenizer")

Conversion succeeds, but inference fails. It seems the image_embeds shape differs between the original ip_adapter and the ip_adapter_plus series (1024 vs 1280).
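
For reference, a minimal sketch of where the two widths come from, assuming diffusers' usual IP-Adapter image-encoding path (the base adapter consumes the pooled projection, while the plus/full-face variants consume the penultimate hidden states); the encoder is the same CLIPVisionModelWithProjection loaded above:

import torch
from transformers import CLIPVisionModelWithProjection

encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder")

with torch.no_grad():
    out = encoder(torch.zeros((1, 3, 224, 224)), output_hidden_states=True)

print(out.image_embeds.shape)       # torch.Size([1, 1024])  - pooled projection, used by ip-adapter_sd15 / _light
print(out.hidden_states[-2].shape)  # torch.Size([1, 257, 1280]) - penultimate hidden states, used by plus / full-face

If this holds, both the exported image encoder and the UNet's image_embeds example input would need to carry the (batch, 257, 1280) hidden states for the full-face checkpoint.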

@Iffa-Intel

Hi @circuluspibo,

Could you clarify:

  1. Which OpenVINO Notebook demo are you using?
  2. Is the model custom/modified or pre-trained?
  3. If custom/modified, could you share it with us for validation purposes?

@circuluspibo
Author

@Iffa-Intel

Iffa-Intel commented Oct 4, 2024

@circuluspibo do you have the xml file? I notice there are only bin files in that link.
Also, which OpenVINO Notebook demo code did you use to infer the model? Or are you actually using custom inference code?

@Iffa-Intel Iffa-Intel added the PSE Escalate to PSE for further investigate label Oct 8, 2024
@avitial avitial self-assigned this Oct 10, 2024