From 9c6377f57067fd06a1fabff2754c5f86ccb2da13 Mon Sep 17 00:00:00 2001
From: Johnson
Date: Fri, 3 Jul 2020 20:35:44 +0800
Subject: [PATCH] Add code and guide on using custom images

---
 README.md                  |  13 ++
 align_custom_faces.py      | 147 +++++++++++++++++++++
 interactive_demo_custom.py | 254 +++++++++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+)
 create mode 100644 align_custom_faces.py
 create mode 100644 interactive_demo_custom.py

diff --git a/README.md b/README.md
index bb80fb8d..835cdb11 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,19 @@ You can change the config using `-c` parameter. To run on `celeb-hq` in 256x256
 
 However, for configs other then FFHQ, you need to obtain new principal direction vectors for the attributes.
 
+To run the demo on custom images, follow these steps:
+
+    mkdir custom
+    # Put your custom images in custom/
+    python align_custom_faces.py
+    # The cropped faces will be located at:
+    # dataset_samples/faces/realign_custom1024x1024
+    python interactive_demo_custom.py
+
+If the min/max range of the sliders isn't suitable for your custom image, modify the following line in `interactive_demo_custom.py`:
+
+    bimpy.slider_float(label, v, -40.0, 40.0)
+
 ## Repository organization
 
 #### Running scripts
diff --git a/align_custom_faces.py b/align_custom_faces.py
new file mode 100644
index 00000000..0af8a52e
--- /dev/null
+++ b/align_custom_faces.py
@@ -0,0 +1,147 @@
+# Copyright 2019-2020 Stanislav Pidhorskyi
+#
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# This work is licensed under the Creative Commons Attribution-NonCommercial
+# 4.0 International License. To view a copy of this license, visit
+# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
+# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+
+import os
+import numpy as np
+import dlib
+from PIL import Image
+import PIL
+import scipy
+import scipy.ndimage
+
+# lefteye_x lefteye_y righteye_x righteye_y nose_x nose_y leftmouth_x leftmouth_y rightmouth_x rightmouth_y
+# 69 111 108 111 88 136 72 152 105 152
+# 44 51 83 51 63 76 47 92 80 92
+
+use_1024 = True
+
+
+def align(img, parts, dst_dir='realign_custom1024x1024', output_size=1024, transform_size=4096, item_idx=0, enable_padding=True):
+    # Parse landmarks.
+    lm = np.array(parts)
+    lm_chin = lm[0: 17]  # left-right
+    lm_eyebrow_left = lm[17: 22]  # left-right
+    lm_eyebrow_right = lm[22: 27]  # left-right
+    lm_nose = lm[27: 31]  # top-down
+    lm_nostrils = lm[31: 36]  # top-down
+    lm_eye_left = lm[36: 42]  # left-clockwise
+    lm_eye_right = lm[42: 48]  # left-clockwise
+    lm_mouth_outer = lm[48: 60]  # left-clockwise
+    lm_mouth_inner = lm[60: 68]  # left-clockwise
+
+    # Calculate auxiliary vectors.
+    eye_left = np.mean(lm_eye_left, axis=0)
+    eye_right = np.mean(lm_eye_right, axis=0)
+    eye_avg = (eye_left + eye_right) * 0.5
+    eye_to_eye = eye_right - eye_left
+    mouth_left = lm_mouth_outer[0]
+    mouth_right = lm_mouth_outer[6]
+    mouth_avg = (mouth_left + mouth_right) * 0.5
+    eye_to_mouth = mouth_avg - eye_avg
+
+    # Choose oriented crop rectangle.
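+    # The crop is the FFHQ-style oriented square: x below is its horizontal
+    # half-axis, built from the eye-to-eye vector and the eye-to-mouth vector
+    # rotated by 90 degrees so the square tilts with the head, then scaled
+    # from the eye and mouth distances; y is x rotated by 90 degrees, and the
+    # centre c sits slightly below the midpoint between the eyes.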
+ x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + x /= np.hypot(*x) + + if use_1024: + x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8) + else: + x *= (np.hypot(*eye_to_eye) * 1.6410 + np.hypot(*eye_to_mouth) * 1.560) / 2.0 + + y = np.flipud(x) * [-1, 1] + + if use_1024: + c = eye_avg + eye_to_mouth * 0.1 + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + else: + c = eye_avg + eye_to_mouth * 0.317 + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + + qsize = np.hypot(*x) * 2 + + img = Image.fromarray(img) + + # Shrink. + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink))) + img = img.resize(rsize, PIL.Image.ANTIALIAS) + quad /= shrink + qsize /= shrink + + # Crop. + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), + min(crop[3] + border, img.size[1])) + if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]: + img = img.crop(crop) + quad -= crop[0:2] + + # Pad. + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), + max(pad[3] - img.size[1] + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + h, w, _ = img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])) + blur = qsize * 0.02 + img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB') + quad += pad[:2] + + # Transform. + img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR) + if output_size < transform_size: + img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS) + + # Save aligned image. 
+ dst_subdir = dst_dir + os.makedirs(dst_subdir, exist_ok=True) + img.save(os.path.join(dst_subdir, '%05d.png' % item_idx)) + + +predictor_path = 'shape_predictor_68_face_landmarks.dat' + +detector = dlib.get_frontal_face_detector() +predictor = dlib.shape_predictor(predictor_path) + +item_idx = 0 + +for filename in os.listdir('custom'): + img = np.asarray(Image.open('custom/' + filename)) + if img.shape[2] == 4: + img = img[:, :, :3] + + dets = detector(img, 0) + print("Number of faces detected: {}".format(len(dets))) + + for i, d in enumerate(dets): + print("Detection {}: Left: {} Top: {} Right: {} Bottom: {}".format( + i, d.left(), d.top(), d.right(), d.bottom())) + + shape = predictor(img, d) + + parts = shape.parts() + + parts = [[part.x, part.y] for part in parts] + + if use_1024: + align(img, parts, dst_dir='dataset_samples/faces/realign_custom1024x1024', output_size=1024, transform_size=4098, item_idx=item_idx) + else: + align(img, parts, dst_dir='dataset_samples/faces/realign_custom128x128', output_size=128, transform_size=512, item_idx=item_idx) + + item_idx += 1 diff --git a/interactive_demo_custom.py b/interactive_demo_custom.py new file mode 100644 index 00000000..a4e45d11 --- /dev/null +++ b/interactive_demo_custom.py @@ -0,0 +1,254 @@ +# Copyright 2019-2020 Stanislav Pidhorskyi +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import torch.utils.data +from net import * +from model import Model +from launcher import run +from checkpointer import Checkpointer +from dlutils.pytorch import count_parameters +from defaults import get_cfg_defaults +import lreq + +from PIL import Image +import bimpy + + +lreq.use_implicit_lreq.set(True) + + +indices = [0, 1, 2, 3, 4, 10, 11, 17, 19] + +labels = ["gender", + "smile", + "attractive", + "wavy-hair", + "young", + "big lips", + "big nose", + "chubby", + "glasses", + ] + + +def sample(cfg, logger): + torch.cuda.set_device(0) + model = Model( + startf=cfg.MODEL.START_CHANNEL_COUNT, + layer_count=cfg.MODEL.LAYER_COUNT, + maxf=cfg.MODEL.MAX_CHANNEL_COUNT, + latent_size=cfg.MODEL.LATENT_SPACE_SIZE, + truncation_psi=cfg.MODEL.TRUNCATIOM_PSI, + truncation_cutoff=cfg.MODEL.TRUNCATIOM_CUTOFF, + mapping_layers=cfg.MODEL.MAPPING_LAYERS, + channels=cfg.MODEL.CHANNELS, + generator=cfg.MODEL.GENERATOR, + encoder=cfg.MODEL.ENCODER) + model.cuda(0) + model.eval() + model.requires_grad_(False) + + decoder = model.decoder + encoder = model.encoder + mapping_tl = model.mapping_tl + mapping_fl = model.mapping_fl + dlatent_avg = model.dlatent_avg + + logger.info("Trainable parameters generator:") + count_parameters(decoder) + + logger.info("Trainable parameters discriminator:") + count_parameters(encoder) + + arguments = dict() + arguments["iteration"] = 0 + + model_dict = { + 'discriminator_s': encoder, + 'generator_s': decoder, + 'mapping_tl_s': mapping_tl, + 'mapping_fl_s': mapping_fl, + 'dlatent_avg': dlatent_avg + } + + checkpointer = Checkpointer(cfg, + model_dict, + {}, + logger=logger, + save=False) + + extra_checkpoint_data = checkpointer.load() + + model.eval() + + layer_count = cfg.MODEL.LAYER_COUNT + + def encode(x): + Z, _ = model.encode(x, layer_count - 1, 1) + Z = Z.repeat(1, model.mapping_fl.num_layers, 1) + return Z + + def decode(x): + layer_idx = torch.arange(2 * layer_count)[np.newaxis, :, np.newaxis] + ones = torch.ones(layer_idx.shape, dtype=torch.float32) + coefs = torch.where(layer_idx < model.truncation_cutoff, ones, ones) + # x = torch.lerp(model.dlatent_avg.buff.data, x, coefs) + return model.decoder(x, layer_count - 1, 1, noise=True) + + path = 'dataset_samples/faces/realign_custom1024x1024' + + paths = list(os.listdir(path)) + paths.sort() + paths_backup = paths[:] + randomize = bimpy.Bool(True) + current_file = bimpy.String("") + + ctx = bimpy.Context() + + attribute_values = [bimpy.Float(0) for i in indices] + + W = [torch.tensor(np.load("principal_directions/direction_%d.npy" % i), dtype=torch.float32) for i in indices] + + rnd = np.random.RandomState(5) + + def loadNext(): + img = np.asarray(Image.open(path + '/' + paths[0])) + current_file.value = paths[0] + paths.pop(0) + if len(paths) == 0: + paths.extend(paths_backup) + + if img.shape[2] == 4: + img = img[:, :, :3] + im = img.transpose((2, 0, 1)) + x = torch.tensor(np.asarray(im, dtype=np.float32), device='cpu', requires_grad=True).cuda() / 127.5 - 1. 
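+        # Pixels are rescaled from [0, 255] to [-1, 1] before encoding; the
+        # check below is a second guard that drops an alpha channel from the
+        # channels-first tensor, and the pooling further down brings the image
+        # to the resolution the decoder actually produces.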
+ if x.shape[0] == 4: + x = x[:3] + + needed_resolution = model.decoder.layer_to_resolution[-1] + while x.shape[2] > needed_resolution: + x = F.avg_pool2d(x, 2, 2) + if x.shape[2] != needed_resolution: + x = F.adaptive_avg_pool2d(x, (needed_resolution, needed_resolution)) + + img_src = ((x * 0.5 + 0.5) * 255).type(torch.long).clamp(0, 255).cpu().type(torch.uint8).transpose(0, 2).transpose(0, 1).numpy() + + latents_original = encode(x[None, ...].cuda()) + latents = latents_original[0, 0].clone() + latents -= model.dlatent_avg.buff.data[0] + + for v, w in zip(attribute_values, W): + v.value = (latents * w).sum() + + for v, w in zip(attribute_values, W): + latents = latents - v.value * w + + return latents, latents_original, img_src + + def loadRandom(): + latents = rnd.randn(1, cfg.MODEL.LATENT_SPACE_SIZE) + lat = torch.tensor(latents).float().cuda() + dlat = mapping_fl(lat) + layer_idx = torch.arange(2 * layer_count)[np.newaxis, :, np.newaxis] + ones = torch.ones(layer_idx.shape, dtype=torch.float32) + coefs = torch.where(layer_idx < model.truncation_cutoff, ones, ones) + dlat = torch.lerp(model.dlatent_avg.buff.data, dlat, coefs) + x = decode(dlat)[0] + img_src = ((x * 0.5 + 0.5) * 255).type(torch.long).clamp(0, 255).cpu().type(torch.uint8).transpose(0, 2).transpose(0, 1).numpy() + latents_original = dlat + latents = latents_original[0, 0].clone() + latents -= model.dlatent_avg.buff.data[0] + + for v, w in zip(attribute_values, W): + v.value = (latents * w).sum() + + for v, w in zip(attribute_values, W): + latents = latents - v.value * w + + return latents, latents_original, img_src + + latents, latents_original, img_src = loadNext() + + ctx.init(1800, 1600, "Styles") + + def update_image(w, latents_original): + with torch.no_grad(): + w = w + model.dlatent_avg.buff.data[0] + w = w[None, None, ...].repeat(1, model.mapping_fl.num_layers, 1) + + layer_idx = torch.arange(model.mapping_fl.num_layers)[np.newaxis, :, np.newaxis] + cur_layers = (7 + 1) * 2 + mixing_cutoff = cur_layers + styles = torch.where(layer_idx < mixing_cutoff, w, latents_original) + + x_rec = decode(styles) + resultsample = ((x_rec * 0.5 + 0.5) * 255).type(torch.long).clamp(0, 255) + resultsample = resultsample.cpu()[0, :, :, :] + return resultsample.type(torch.uint8).transpose(0, 2).transpose(0, 1) + + im_size = 2 ** (cfg.MODEL.LAYER_COUNT + 1) + im = update_image(latents, latents_original) + print(im.shape) + im = bimpy.Image(im) + + display_original = True + + seed = 0 + + while not ctx.should_close(): + with ctx: + new_latents = latents + sum([v.value * w for v, w in zip(attribute_values, W)]) + + if display_original: + im = bimpy.Image(img_src) + else: + im = bimpy.Image(update_image(new_latents, latents_original)) + + bimpy.begin("Principal directions") + bimpy.columns(2) + bimpy.set_column_width(0, im_size + 20) + bimpy.image(im) + bimpy.next_column() + + for v, label in zip(attribute_values, labels): + bimpy.slider_float(label, v, -40.0, 40.0) + + bimpy.checkbox("Randomize noise", randomize) + + if randomize.value: + seed += 1 + + torch.manual_seed(seed) + + if bimpy.button('Next'): + latents, latents_original, img_src = loadNext() + display_original = True + if bimpy.button('Display Reconstruction'): + display_original = False + if bimpy.button('Generate random'): + latents, latents_original, img_src = loadRandom() + display_original = False + + if bimpy.input_text("Current file", current_file, 64) and os.path.exists(path + '/' + current_file.value): + paths.insert(0, current_file.value) + latents, 
latents_original, img_src = loadNext() + + bimpy.end() + + +if __name__ == "__main__": + gpu_count = 1 + run(sample, get_cfg_defaults(), description='ALAE-interactive', default_config='configs/ffhq.yaml', + world_size=gpu_count, write_log=False)
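Note: `align_custom_faces.py` loads dlib's 68-point landmark model from `shape_predictor_68_face_landmarks.dat` in the working directory, and `interactive_demo_custom.py` relies on the same FFHQ checkpoint and `principal_directions/direction_*.npy` files as the stock `interactive_demo.py`. The landmark model is not included in this patch; a minimal sketch for fetching it, assuming the usual dlib.net download location:

    import bz2
    import shutil
    import urllib.request

    # Download the compressed dlib model and unpack it next to align_custom_faces.py.
    url = "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
    archive, _ = urllib.request.urlretrieve(url)
    with bz2.open(archive, "rb") as src, open("shape_predictor_68_face_landmarks.dat", "wb") as dst:
        shutil.copyfileobj(src, dst)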