How to apply VToonify to multiple people in this project? #78

Open
Ohjunghh opened this issue Sep 13, 2024 · 2 comments

@Ohjunghh

I tried VToonify on a video with multiple people, but the output looks wrong: the faces are not recognized properly and the style gets applied to the background. Is there a way to run VToonify on multiple people? How should I modify style_transfer.py?

@williamyang1991
Owner

Our project is mainly designed to toonify a single person.
To stylize multiple people, you could crop each person out, toonify each crop separately, and fuse the results back into a single frame.
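
A minimal sketch of that idea (not part of the repo): it assumes a hypothetical toonify_crop(crop_bgr) helper that wraps the existing single-face pipeline (alignment, pSp encoding, VToonify generation) and returns a stylized BGR image, and it fuses each stylized crop back with a feathered mask so the rectangular boundary is blended rather than pasted hard.

import cv2
import numpy as np

def fuse_stylized_crop(frame_bgr, stylized_bgr, box, feather=31):
    # Paste a stylized crop into the frame, blending the border with a soft mask.
    x1, y1, x2, y2 = box
    h, w = y2 - y1, x2 - x1
    stylized = cv2.resize(stylized_bgr, (w, h))

    # Alpha mask: 1.0 in the center of the box, fading to 0.0 at the edges.
    mask = np.zeros((h, w), dtype=np.float32)
    margin = max(feather // 2, 1)
    mask[margin:h - margin, margin:w - margin] = 1.0
    mask = cv2.GaussianBlur(mask, (feather | 1, feather | 1), 0)[..., None]

    roi = frame_bgr[y1:y2, x1:x2].astype(np.float32)
    blended = mask * stylized.astype(np.float32) + (1.0 - mask) * roi
    frame_bgr[y1:y2, x1:x2] = blended.astype(np.uint8)
    return frame_bgr

def toonify_people(frame_bgr, face_boxes, toonify_crop):
    # Crop each detected face, toonify it separately, and fuse the results
    # back into a single frame.
    out = frame_bgr.copy()
    for (x1, y1, x2, y2) in face_boxes:
        stylized = toonify_crop(frame_bgr[y1:y2, x1:x2])  # single-face VToonify pipeline
        out = fuse_stylized_crop(out, stylized, (x1, y1, x2, y2))
    return out

OpenCV's seamlessClone is another option for the fusing step if Poisson blending is preferred over a simple alpha mask.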

@Ohjunghh
Author

Ohjunghh commented Sep 19, 2024

The boundary of the square region where VToonify is applied is too visible in the output. Can you find the problem in this code?

import os
import argparse
import numpy as np
import cv2
import torch
from torchvision import transforms
import torch.nn.functional as F
from tqdm import tqdm
from model.vtoonify import VToonify
from util import save_image, tensor2cv2, load_psp_standalone
from PIL import Image
import dlib
from model.bisenet.model import BiSeNet


class TestOptions():
    def __init__(self):
        self.parser = argparse.ArgumentParser(description="Style Transfer")
        self.parser.add_argument("--content", type=str, default='./data/077436.jpg', help="path of the content image/video")
        self.parser.add_argument("--style_id", type=int, default=26, help="the id of the style image")
        self.parser.add_argument("--style_degree", type=float, default=0.5, help="style degree for VToonify-D")
        self.parser.add_argument("--color_transfer", action="store_true", help="transfer the color of the style")
        self.parser.add_argument("--ckpt", type=str, default='./checkpoint/vtoonify_d_cartoon/vtoonify_s_d_c.pt', help="path of the saved model")
        self.parser.add_argument("--output_path", type=str, default='./output/', help="path of the output images")
        self.parser.add_argument("--style_encoder_path", type=str, default='./checkpoint/encoder.pt', help="path of the style encoder")
        self.parser.add_argument("--exstyle_path", type=str, default=None, help="path of the extrinsic style code")
        self.parser.add_argument("--faceparsing_path", type=str, default='./checkpoint/faceparsing.pth', help="path of the face parsing model")
        self.parser.add_argument("--video", action="store_true", help="if true, video stylization; if false, image stylization")
        self.parser.add_argument("--cpu", action="store_true", help="if true, only use cpu")
        self.parser.add_argument("--backbone", type=str, default='dualstylegan', help="dualstylegan | toonify")
        self.parser.add_argument("--batch_size", type=int, default=4, help="batch size of frames when processing video")
        self.parser.add_argument("--yolo_model_path", type=str, default='../face_recognition_2/627.pt', help="path to the YOLO model")
       
    def parse(self):
        self.opt = self.parser.parse_args()
        args = vars(self.opt)
        print('Load options')
        for name, value in sorted(args.items()):
            print(f'{name}: {value}')
        return self.opt


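# Run the custom YOLO detector on a BGR frame and return bounding boxes for
# detections of the (model-specific) face class above the confidence threshold.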
def detect_faces_yolo(model, img, confidence_t=0.5, face_class=4):
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = model(img_rgb)

    faces = []
    for det in results.xyxy[0]:
        x1, y1, x2, y2, conf, cls = det
        cls = int(cls)
        if conf >= confidence_t and cls == face_class:
            faces.append((int(x1), int(y1), int(x2), int(y2)))
    return faces


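# Detect 68 facial landmarks with dlib inside a YOLO face box and map them
# back to full-frame coordinates.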
def detect_landmarks_dlib(image, predictor, x1, y1, x2, y2):
    face_roi = image[y1:y2, x1:x2]
    gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)
   
    detector = dlib.get_frontal_face_detector()
    rects = detector(gray, 1)
   
    if len(rects) == 0:
        print("No landmarks detected in the face region")
        return None

    # The crop should contain a single face; use the first dlib detection.
    shape = predictor(gray, rects[0])
    landmarks = np.array([[p.x, p.y] for p in shape.parts()])

    # Shift the landmarks from crop coordinates back to full-frame coordinates.
    landmarks[:, 0] += x1
    landmarks[:, 1] += y1

    return landmarks


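# FFHQ-style alignment: build an oriented crop quad from the eye and mouth
# landmarks and warp it to a 256x256 image for the encoder.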
def align_face(image, landmarks):
    lm_eye_left = landmarks[36:42]
    lm_eye_right = landmarks[42:48]

    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_avg = (landmarks[48] + landmarks[54]) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    qsize = np.hypot(*x) * 2

    # OpenCV frames are BGR; convert to RGB before building the PIL image so the
    # encoder sees the expected channel order, then warp the quad to 256x256.
    img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    img = img.transform((256, 256), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)

    return img


if __name__ == "__main__":
    parser = TestOptions()
    args = parser.parse()
    print('*'*98)
   
    device = "cpu" if args.cpu else "cuda"
   
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
   
    confidence_t = 0.5
    yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path=args.yolo_model_path, force_reload=True).to(device)
    yolo_model.conf = confidence_t
    yolo_model.classes = None
    yolo_model.agnostic_nms = False

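    # Load the VToonify generator, the pSp style encoder and, for the
    # DualStyleGAN backbone, the extrinsic style code (requires --exstyle_path).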
    vtoonify = VToonify(backbone=args.backbone)
    vtoonify.load_state_dict(torch.load(args.ckpt, map_location=lambda storage, loc: storage)['g_ema'])
    vtoonify.to(device)

    pspencoder = load_psp_standalone(args.style_encoder_path, device)    
   
    if args.backbone == 'dualstylegan':
        exstyles = np.load(args.exstyle_path, allow_pickle='TRUE').item()
        stylename = list(exstyles.keys())[args.style_id]
        exstyle = torch.tensor(exstyles[stylename]).to(device)
        with torch.no_grad():
            exstyle = vtoonify.zplus2wplus(exstyle)
         
    print('Load models successfully!')

    filename = args.content
    basename = os.path.basename(filename).split('.')[0]
    print(f'Processing {filename} with vtoonify_{args.backbone[0]}')

    predictor = dlib.shape_predictor('./checkpoint/shape_predictor_68_face_landmarks.dat')

    if args.video:
        video_cap = cv2.VideoCapture(filename)
        num = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        output_video_path = os.path.join(args.output_path, f"{basename}_stylized.mp4")
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = int(video_cap.get(cv2.CAP_PROP_FPS))
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        videoWriter = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        parsingpredictor = BiSeNet(n_classes=19)
        parsingpredictor.load_state_dict(torch.load(args.faceparsing_path, map_location=lambda storage, loc: storage))
        parsingpredictor.to(device).eval()

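        # Stylize the video frame by frame; frames with no detected face are
        # written out unchanged.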
        for i in tqdm(range(num)):
            success, frame = video_cap.read()
            if not success:
                break

            frame_copy = frame.copy()
           
            faces = detect_faces_yolo(yolo_model, frame, confidence_t)
            if not faces:
                # The frame read from cv2.VideoCapture is already BGR; write it unchanged.
                videoWriter.write(frame)
                continue

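            # Stylize each detected face independently and paste it back into the
            # frame copy at its YOLO bounding box.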
            for (x1, y1, x2, y2) in faces:
                landmarks = detect_landmarks_dlib(frame, predictor, x1, y1, x2, y2)
                if landmarks is None:
                    continue

                aligned_face = align_face(frame, landmarks)
                face_tensor = transform(aligned_face).unsqueeze(dim=0).to(device)

                with torch.no_grad():
                    x_p = F.interpolate(parsingpredictor(2 * (F.interpolate(face_tensor, scale_factor=2, mode='bilinear', align_corners=False)))[0],
                                        scale_factor=0.5, recompute_scale_factor=False).detach()

                inputs = torch.cat((face_tensor, x_p / 16.), dim=1)

                with torch.no_grad():
                    s_w = pspencoder(face_tensor)
                    s_w = vtoonify.zplus2wplus(s_w)
                    if args.backbone == 'dualstylegan':
                        s_w[:, :7] = exstyle[:, :7]
                       
                    y_tilde = vtoonify(inputs, s_w.repeat(inputs.size(0), 1, 1), d_s=args.style_degree)
                    y_tilde = torch.clamp(y_tilde, -1, 1)

                # tensor2cv2 already returns a BGR image, so no extra color conversion
                # is needed before pasting into the BGR frame.
                stylized_face_np = tensor2cv2(y_tilde[0].cpu())

                # The stylized image corresponds to the aligned 256x256 quad, and it is
                # resized and pasted back as a hard rectangle over the YOLO bounding box.
                frame_copy[y1:y2, x1:x2] = cv2.resize(stylized_face_np, (x2 - x1, y2 - y1))

            # frame_copy is already in BGR order, which is what VideoWriter expects.
            videoWriter.write(frame_copy)

        videoWriter.release()
        video_cap.release()

    print('Transfer style successfully!')
