How to apply VToonify to multiple people in this project? #78

Open
Ohjunghh opened this issue Sep 13, 2024 · 2 comments

@Ohjunghh

I tried VToonify on a video with multiple people, but the output looks wrong: the faces are not recognized properly and the style gets applied to the background. Is there a way to run VToonify on multiple people? How should I modify style_transfer.py?

@williamyang1991
Owner

Our project is mainly designed to toonify a single person.
To stylize multiple people, you could crop each person out, toonify each crop separately, and fuse the results back into a single frame.
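
A minimal sketch of that idea (not part of the repo): it assumes a hypothetical toonify_crop(crop_bgr) helper that wraps the existing single-face pipeline (alignment, pSp encoding, VToonify generation) and returns a stylized BGR image, and it fuses each stylized crop back with a feathered mask so the rectangular boundary is blended rather than pasted hard.

import cv2
import numpy as np

def fuse_stylized_crop(frame_bgr, stylized_bgr, box, feather=31):
    # Paste a stylized crop into the frame, blending the border with a soft mask.
    x1, y1, x2, y2 = box
    h, w = y2 - y1, x2 - x1
    stylized = cv2.resize(stylized_bgr, (w, h))

    # Alpha mask: 1.0 in the center of the box, fading to 0.0 at the edges.
    mask = np.zeros((h, w), dtype=np.float32)
    margin = max(feather // 2, 1)
    mask[margin:h - margin, margin:w - margin] = 1.0
    mask = cv2.GaussianBlur(mask, (feather | 1, feather | 1), 0)[..., None]

    roi = frame_bgr[y1:y2, x1:x2].astype(np.float32)
    blended = mask * stylized.astype(np.float32) + (1.0 - mask) * roi
    frame_bgr[y1:y2, x1:x2] = blended.astype(np.uint8)
    return frame_bgr

def toonify_people(frame_bgr, face_boxes, toonify_crop):
    # Crop each detected face, toonify it separately, and fuse the results
    # back into a single frame.
    out = frame_bgr.copy()
    for (x1, y1, x2, y2) in face_boxes:
        stylized = toonify_crop(frame_bgr[y1:y2, x1:x2])  # single-face VToonify pipeline
        out = fuse_stylized_crop(out, stylized, (x1, y1, x2, y2))
    return out

OpenCV's seamlessClone is another option for the fusing step if Poisson blending is preferred over a simple alpha mask.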

@Ohjunghh
Author

Ohjunghh commented Sep 19, 2024

The boundary of the square region where VToonify is applied is too visible in the output. Can you find the problem in this code?

import os
import argparse
import numpy as np
import cv2
import torch
from torchvision import transforms
import torch.nn.functional as F
from tqdm import tqdm
from model.vtoonify import VToonify
from util import save_image, tensor2cv2, load_psp_standalone
from PIL import Image
import dlib
from model.bisenet.model import BiSeNet


class TestOptions():
    def __init__(self):
        self.parser = argparse.ArgumentParser(description="Style Transfer")
        self.parser.add_argument("--content", type=str, default='./data/077436.jpg', help="path of the content image/video")
        self.parser.add_argument("--style_id", type=int, default=26, help="the id of the style image")
        self.parser.add_argument("--style_degree", type=float, default=0.5, help="style degree for VToonify-D")
        self.parser.add_argument("--color_transfer", action="store_true", help="transfer the color of the style")
        self.parser.add_argument("--ckpt", type=str, default='./checkpoint/vtoonify_d_cartoon/vtoonify_s_d_c.pt', help="path of the saved model")
        self.parser.add_argument("--output_path", type=str, default='./output/', help="path of the output images")
        self.parser.add_argument("--style_encoder_path", type=str, default='./checkpoint/encoder.pt', help="path of the style encoder")
        self.parser.add_argument("--exstyle_path", type=str, default=None, help="path of the extrinsic style code")
        self.parser.add_argument("--faceparsing_path", type=str, default='./checkpoint/faceparsing.pth', help="path of the face parsing model")
        self.parser.add_argument("--video", action="store_true", help="if true, video stylization; if false, image stylization")
        self.parser.add_argument("--cpu", action="store_true", help="if true, only use cpu")
        self.parser.add_argument("--backbone", type=str, default='dualstylegan', help="dualstylegan | toonify")
        self.parser.add_argument("--batch_size", type=int, default=4, help="batch size of frames when processing video")
        self.parser.add_argument("--yolo_model_path", type=str, default='../face_recognition_2/627.pt', help="path to the YOLO model")
       
    def parse(self):
        self.opt = self.parser.parse_args()
        args = vars(self.opt)
        print('Load options')
        for name, value in sorted(args.items()):
            print(f'{name}: {value}')
        return self.opt


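# Run the custom YOLO detector on a BGR frame and return bounding boxes for
# detections of the (model-specific) face class above the confidence threshold.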
def detect_faces_yolo(model, img, confidence_t=0.5, face_class=4):
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = model(img_rgb)

    faces = []
    for det in results.xyxy[0]:
        x1, y1, x2, y2, conf, cls = det
        cls = int(cls)
        if conf >= confidence_t and cls == face_class:
            faces.append((int(x1), int(y1), int(x2), int(y2)))
    return faces


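# Detect 68 facial landmarks with dlib inside a YOLO face box and map them
# back to full-frame coordinates.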
def detect_landmarks_dlib(image, predictor, x1, y1, x2, y2):
    face_roi = image[y1:y2, x1:x2]
    gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)
   
    detector = dlib.get_frontal_face_detector()
    rects = detector(gray, 1)
   
    if len(rects) == 0:
        print("No landmarks detected in the face region")
        return None

    # The crop should contain a single face; use the first dlib detection.
    shape = predictor(gray, rects[0])
    landmarks = np.array([[p.x, p.y] for p in shape.parts()])

    # Shift the landmarks from crop coordinates back to full-frame coordinates.
    landmarks[:, 0] += x1
    landmarks[:, 1] += y1

    return landmarks


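# FFHQ-style alignment: build an oriented crop quad from the eye and mouth
# landmarks and warp it to a 256x256 image for the encoder.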
def align_face(image, landmarks):
    lm_eye_left = landmarks[36:42]
    lm_eye_right = landmarks[42:48]

    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_avg = (landmarks[48] + landmarks[54]) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    qsize = np.hypot(*x) * 2

    # OpenCV frames are BGR; convert to RGB before building the PIL image so the
    # encoder sees the expected channel order, then warp the quad to 256x256.
    img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    img = img.transform((256, 256), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)

    return img


if __name__ == "__main__":
    parser = TestOptions()
    args = parser.parse()
    print('*'*98)
   
    device = "cpu" if args.cpu else "cuda"
   
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
   
    confidence_t = 0.5
    yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path=args.yolo_model_path, force_reload=True).to(device)
    yolo_model.conf = confidence_t
    yolo_model.classes = None
    yolo_model.agnostic_nms = False

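    # Load the VToonify generator, the pSp style encoder and, for the
    # DualStyleGAN backbone, the extrinsic style code (requires --exstyle_path).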
    vtoonify = VToonify(backbone=args.backbone)
    vtoonify.load_state_dict(torch.load(args.ckpt, map_location=lambda storage, loc: storage)['g_ema'])
    vtoonify.to(device)

    pspencoder = load_psp_standalone(args.style_encoder_path, device)    
   
    if args.backbone == 'dualstylegan':
        exstyles = np.load(args.exstyle_path, allow_pickle='TRUE').item()
        stylename = list(exstyles.keys())[args.style_id]
        exstyle = torch.tensor(exstyles[stylename]).to(device)
        with torch.no_grad():
            exstyle = vtoonify.zplus2wplus(exstyle)
         
    print('Load models successfully!')

    filename = args.content
    basename = os.path.basename(filename).split('.')[0]
    print(f'Processing {filename} with vtoonify_{args.backbone[0]}')

    predictor = dlib.shape_predictor('./checkpoint/shape_predictor_68_face_landmarks.dat')

    if args.video:
        video_cap = cv2.VideoCapture(filename)
        num = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        output_video_path = os.path.join(args.output_path, f"{basename}_stylized.mp4")
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = int(video_cap.get(cv2.CAP_PROP_FPS))
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        videoWriter = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        parsingpredictor = BiSeNet(n_classes=19)
        parsingpredictor.load_state_dict(torch.load(args.faceparsing_path, map_location=lambda storage, loc: storage))
        parsingpredictor.to(device).eval()

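        # Stylize the video frame by frame; frames with no detected face are
        # written out unchanged.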
        for i in tqdm(range(num)):
            success, frame = video_cap.read()
            if not success:
                break

            frame_copy = frame.copy()
           
            faces = detect_faces_yolo(yolo_model, frame, confidence_t)
            if not faces:
                # The frame read from cv2.VideoCapture is already BGR; write it unchanged.
                videoWriter.write(frame)
                continue

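            # Stylize each detected face independently and paste it back into the
            # frame copy at its YOLO bounding box.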
            for (x1, y1, x2, y2) in faces:
                landmarks = detect_landmarks_dlib(frame, predictor, x1, y1, x2, y2)
                if landmarks is None:
                    continue

                aligned_face = align_face(frame, landmarks)
                face_tensor = transform(aligned_face).unsqueeze(dim=0).to(device)

                with torch.no_grad():
                    x_p = F.interpolate(parsingpredictor(2 * (F.interpolate(face_tensor, scale_factor=2, mode='bilinear', align_corners=False)))[0],
                                        scale_factor=0.5, recompute_scale_factor=False).detach()

                inputs = torch.cat((face_tensor, x_p / 16.), dim=1)

                with torch.no_grad():
                    s_w = pspencoder(face_tensor)
                    s_w = vtoonify.zplus2wplus(s_w)
                    if args.backbone == 'dualstylegan':
                        s_w[:, :7] = exstyle[:, :7]
                       
                    y_tilde = vtoonify(inputs, s_w.repeat(inputs.size(0), 1, 1), d_s=args.style_degree)
                    y_tilde = torch.clamp(y_tilde, -1, 1)

                # tensor2cv2 already returns a BGR image, so no extra color conversion
                # is needed before pasting into the BGR frame.
                stylized_face_np = tensor2cv2(y_tilde[0].cpu())

                # The stylized image corresponds to the aligned 256x256 quad, and it is
                # resized and pasted back as a hard rectangle over the YOLO bounding box.
                frame_copy[y1:y2, x1:x2] = cv2.resize(stylized_face_np, (x2 - x1, y2 - y1))

            # frame_copy is already in BGR order, which is what VideoWriter expects.
            videoWriter.write(frame_copy)

        videoWriter.release()
        video_cap.release()

    print('Transfer style successfully!')
