How to apply VToonify to multiple people in this project? #78
Comments
Our project is mainly designed to toonify a single person.
The boundary of the stylized square region is too visible. Can you find the problem in this code?

```python
import os
import argparse
import numpy as np
import cv2
import torch
from torchvision import transforms
import torch.nn.functional as F
from tqdm import tqdm
from model.vtoonify import VToonify
from util import save_image, tensor2cv2, load_psp_standalone
from PIL import Image
import dlib
from model.bisenet.model import BiSeNet


class TestOptions():
    def __init__(self):
        self.parser = argparse.ArgumentParser(description="Style Transfer")
        self.parser.add_argument("--content", type=str, default='./data/077436.jpg', help="path of the content image/video")
        self.parser.add_argument("--style_id", type=int, default=26, help="the id of the style image")
        self.parser.add_argument("--style_degree", type=float, default=0.5, help="style degree for VToonify-D")
        self.parser.add_argument("--color_transfer", action="store_true", help="transfer the color of the style")
        self.parser.add_argument("--ckpt", type=str, default='./checkpoint/vtoonify_d_cartoon/vtoonify_s_d_c.pt', help="path of the saved model")
        self.parser.add_argument("--output_path", type=str, default='./output/', help="path of the output images")
        self.parser.add_argument("--style_encoder_path", type=str, default='./checkpoint/encoder.pt', help="path of the style encoder")
        self.parser.add_argument("--exstyle_path", type=str, default=None, help="path of the extrinsic style code")
        self.parser.add_argument("--faceparsing_path", type=str, default='./checkpoint/faceparsing.pth', help="path of the face parsing model")
        self.parser.add_argument("--video", action="store_true", help="if true, video stylization; if false, image stylization")
        self.parser.add_argument("--cpu", action="store_true", help="if true, only use cpu")
        self.parser.add_argument("--backbone", type=str, default='dualstylegan', help="dualstylegan | toonify")
        self.parser.add_argument("--batch_size", type=int, default=4, help="batch size of frames when processing video")
        self.parser.add_argument("--yolo_model_path", type=str, default='../face_recognition_2/627.pt', help="path to the YOLO model")

    def parse(self):
        self.opt = self.parser.parse_args()
        args = vars(self.opt)
        print('Load options')
        for name, value in sorted(args.items()):
            print(f'{name}: {value}')
        return self.opt


def detect_faces_yolo(model, img, confidence_t=0.5, face_class=4):
    # YOLO expects RGB input; OpenCV frames are BGR.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = model(img_rgb)
    faces = []
    for det in results.xyxy[0]:
        x1, y1, x2, y2, conf, cls = det
        cls = int(cls)
        if conf >= confidence_t and cls == face_class:
            faces.append((int(x1), int(y1), int(x2), int(y2)))
    return faces


def detect_landmarks_dlib(image, predictor, x1, y1, x2, y2):
    # Run the dlib detector only inside the YOLO box, then shift the
    # landmarks back into full-frame coordinates.
    face_roi = image[y1:y2, x1:x2]
    gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)
    detector = dlib.get_frontal_face_detector()
    rects = detector(gray, 1)
    if len(rects) == 0:
        print("No landmarks detected in the face region")
        return None
    for rect in rects:
        shape = predictor(gray, rect)
        landmarks = np.array([[p.x, p.y] for p in shape.parts()])
        landmarks[:, 0] += x1
        landmarks[:, 1] += y1
        return landmarks  # only the first detection inside the ROI is used


def align_face(image, landmarks):
    # FFHQ-style alignment: build an oriented quad from the eye and mouth
    # landmarks and warp it to a 256x256 crop.
    lm_eye_left = landmarks[36:42]
    lm_eye_right = landmarks[42:48]
    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_avg = (landmarks[48] + landmarks[54]) * 0.5
    eye_to_mouth = mouth_avg - eye_avg
    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    # The cv2 frame is BGR; convert so the encoder sees RGB colours.
    img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    img = img.transform((256, 256), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
    return img


if __name__ == "__main__":
    parser = TestOptions()
    args = parser.parse()
    print('*' * 98)

    device = "cpu" if args.cpu else "cuda"
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])

    confidence_t = 0.5
    yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path=args.yolo_model_path, force_reload=True).to(device)
    yolo_model.conf = confidence_t
    yolo_model.classes = None
    yolo_model.agnostic_nms = False

    vtoonify = VToonify(backbone=args.backbone)
    vtoonify.load_state_dict(torch.load(args.ckpt, map_location=lambda storage, loc: storage)['g_ema'])
    vtoonify.to(device)

    pspencoder = load_psp_standalone(args.style_encoder_path, device)

    if args.backbone == 'dualstylegan':
        exstyles = np.load(args.exstyle_path, allow_pickle='TRUE').item()
        stylename = list(exstyles.keys())[args.style_id]
        exstyle = torch.tensor(exstyles[stylename]).to(device)
        with torch.no_grad():
            exstyle = vtoonify.zplus2wplus(exstyle)

    print('Load models successfully!')

    filename = args.content
    basename = os.path.basename(filename).split('.')[0]
    print(f'Processing {filename} with vtoonify_{args.backbone[0]}')

    predictor = dlib.shape_predictor('./checkpoint/shape_predictor_68_face_landmarks.dat')

    if args.video:
        video_cap = cv2.VideoCapture(filename)
        num = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        output_video_path = os.path.join(args.output_path, f"{basename}_stylized.mp4")
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = int(video_cap.get(cv2.CAP_PROP_FPS))
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        videoWriter = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        parsingpredictor = BiSeNet(n_classes=19)
        parsingpredictor.load_state_dict(torch.load(args.faceparsing_path, map_location=lambda storage, loc: storage))
        parsingpredictor.to(device).eval()

        for i in tqdm(range(num)):
            success, frame = video_cap.read()
            if not success:
                break
            frame_copy = frame.copy()
            faces = detect_faces_yolo(yolo_model, frame, confidence_t)
            if not faces:
                videoWriter.write(frame)  # frame is already BGR; no conversion needed
                continue
            for (x1, y1, x2, y2) in faces:
                landmarks = detect_landmarks_dlib(frame, predictor, x1, y1, x2, y2)
                if landmarks is None:
                    continue
                aligned_face = align_face(frame, landmarks)
                face_tensor = transform(aligned_face).unsqueeze(dim=0).to(device)
                with torch.no_grad():
                    # Face parsing map, computed at 2x and downsampled back to the crop resolution.
                    x_p = F.interpolate(parsingpredictor(2 * (F.interpolate(face_tensor, scale_factor=2, mode='bilinear', align_corners=False)))[0],
                                        scale_factor=0.5, recompute_scale_factor=False).detach()
                    inputs = torch.cat((face_tensor, x_p / 16.), dim=1)
                with torch.no_grad():
                    s_w = pspencoder(face_tensor)
                    s_w = vtoonify.zplus2wplus(s_w)
                    if args.backbone == 'dualstylegan':
                        s_w[:, :7] = exstyle[:, :7]
                    y_tilde = vtoonify(inputs, s_w.repeat(inputs.size(0), 1, 1), d_s=args.style_degree)
                    y_tilde = torch.clamp(y_tilde, -1, 1)
                # tensor2cv2 returns a BGR image ready for OpenCV, so no extra cvtColor is needed.
                stylized_face_np = tensor2cv2(y_tilde[0].cpu())
                # NOTE: the aligned quad does not coincide with the YOLO box, so this warped
                # crop is pasted back into a region it was not cut from, and overwriting the
                # whole box produces the visible square seam around each face.
                frame_copy[y1:y2, x1:x2] = cv2.resize(stylized_face_np, (x2 - x1, y2 - y1))
            videoWriter.write(frame_copy)  # frame_copy is BGR already

        videoWriter.release()
        video_cap.release()

    print('Transfer style successfully!')
```
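The hard square edge appears because the resized stylized crop is written straight into the YOLO bounding box, replacing every pixel up to the box border. Below is a minimal sketch of one way to soften it, assuming you already have the stylized BGR crop and its target box; the helper name `paste_with_feather` and the `feather` fraction are hypothetical, not part of VToonify.

```python
import cv2
import numpy as np


def paste_with_feather(frame, stylized_crop, x1, y1, x2, y2, feather=0.15):
    """Blend a stylized BGR crop into frame[y1:y2, x1:x2] with soft edges."""
    h, w = y2 - y1, x2 - x1
    crop = cv2.resize(stylized_crop, (w, h))

    # Alpha mask that is 1 in the centre and fades to 0 towards the box border.
    mask = np.zeros((h, w), dtype=np.float32)
    bw, bh = int(w * feather), int(h * feather)
    mask[bh:h - bh, bw:w - bw] = 1.0
    mask = cv2.GaussianBlur(mask, (0, 0), sigmaX=max(bw, 1), sigmaY=max(bh, 1))
    mask = mask[..., None]  # broadcast over the 3 colour channels

    roi = frame[y1:y2, x1:x2].astype(np.float32)
    blended = mask * crop.astype(np.float32) + (1.0 - mask) * roi
    frame[y1:y2, x1:x2] = blended.astype(np.uint8)
    return frame
```

Called in place of the direct `frame_copy[y1:y2, x1:x2] = ...` assignment, this removes the hard seam; `cv2.seamlessClone` is an alternative if the colour mismatch at the border is also noticeable.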
I think the output video looks weird because when I tried VToonify on multiple people, their faces were not recognized properly and the style got applied to the background. Is there a way to make VToonify work on multiple people? How do I fix style_transfer.py?
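Part of the "style on the background" problem is that everything inside each YOLO box is replaced, background pixels included. One option is to reuse the BiSeNet parsing output the script already computes and blend only face/hair pixels back into the frame. This is a rough sketch, assuming `x_p` holds the 19-channel parsing logits for the aligned crop; the label indices follow the common face-parsing convention (1 = skin, 17 = hair, ...) and should be checked against the checkpoint you use.

```python
import cv2
import numpy as np
import torch

# Assumed label set: indices treated as "face" in the 19-class parsing output.
# Verify these against your face-parsing checkpoint.
FACE_LABELS = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17]


def face_mask_from_parsing(x_p: torch.Tensor, out_size) -> np.ndarray:
    """Turn (1, 19, H, W) parsing logits into a soft HxWx1 face mask in [0, 1]."""
    labels = x_p.argmax(dim=1)[0].cpu().numpy().astype(np.uint8)      # (H, W) label map
    mask = np.isin(labels, FACE_LABELS).astype(np.float32)
    mask = cv2.resize(mask, out_size, interpolation=cv2.INTER_LINEAR)
    mask = cv2.GaussianBlur(mask, (0, 0), sigmaX=5)                    # soften the mask boundary
    return mask[..., None]                                             # broadcastable over BGR channels


# Usage inside the per-face loop (x_p and the stylized crop share the same
# aligned-crop coordinates, so the mask lines up with the crop content):
# mask = face_mask_from_parsing(x_p, (x2 - x1, y2 - y1))
# roi = frame_copy[y1:y2, x1:x2].astype(np.float32)
# crop = cv2.resize(stylized_face_np, (x2 - x1, y2 - y1)).astype(np.float32)
# frame_copy[y1:y2, x1:x2] = (mask * crop + (1 - mask) * roi).astype(np.uint8)
```

Combined with a higher YOLO confidence threshold, or simply skipping boxes where dlib finds no landmarks (which the script already does), this keeps the cartoon effect on the faces and leaves the background untouched.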