# See the License for the specific language governing permissions and
# limitations under the License.

+from latentsync.utils.util import read_video, write_video
from torchvision import transforms
import cv2
from einops import rearrange
-import mediapipe as mp
import torch
import numpy as np
from typing import Union
@@ -32,90 +32,31 @@ def load_fixed_mask(resolution: int, mask_image_path="latentsync/utils/mask.png"


class ImageProcessor:
-    def __init__(self, resolution: int = 512, mask: str = "fix_mask", device: str = "cpu", mask_image=None):
+    def __init__(self, resolution: int = 512, device: str = "cpu", mask_image=None):
        self.resolution = resolution
        self.resize = transforms.Resize(
-            (resolution, resolution), interpolation=transforms.InterpolationMode.BILINEAR, antialias=True
+            (resolution, resolution), interpolation=transforms.InterpolationMode.BICUBIC, antialias=True
        )
        self.normalize = transforms.Normalize([0.5], [0.5], inplace=True)
-        self.mask = mask

-        if mask in ["mouth", "face", "eye"]:
-            self.face_mesh = mp.solutions.face_mesh.FaceMesh(static_image_mode=True)  # Process single image
-        if mask == "fix_mask":
-            self.face_mesh = None
-            self.restorer = AlignRestore(device=device)
+        self.restorer = AlignRestore(resolution=resolution, device=device)

-        if mask_image is None:
-            self.mask_image = load_fixed_mask(resolution)
-        else:
-            self.mask_image = mask_image
-
-        if device != "cpu":
-            self.face_detector = FaceDetector(device=device)
-            self.face_mesh = None
-        else:
-            # self.face_mesh = mp.solutions.face_mesh.FaceMesh(static_image_mode=True)  # Process single image
-            self.face_mesh = None
-            self.face_detector = None
-
-    def detect_facial_landmarks(self, image: np.ndarray):
-        height, width, _ = image.shape
-        results = self.face_mesh.process(image)
-        if not results.multi_face_landmarks:  # Face not detected
-            raise RuntimeError("Face not detected")
-        face_landmarks = results.multi_face_landmarks[0]  # Only use the first face in the image
-        landmark_coordinates = [
-            (int(landmark.x * width), int(landmark.y * height)) for landmark in face_landmarks.landmark
-        ]  # x means width, y means height
-        return landmark_coordinates
-
-    def preprocess_one_masked_image(self, image: torch.Tensor) -> np.ndarray:
-        image = self.resize(image)
-
-        if self.mask == "mouth" or self.mask == "face":
-            landmark_coordinates = self.detect_facial_landmarks(image)
-            if self.mask == "mouth":
-                surround_landmarks = mouth_surround_landmarks
-            else:
-                surround_landmarks = face_surround_landmarks
-
-            points = [landmark_coordinates[landmark] for landmark in surround_landmarks]
-            points = np.array(points)
-            mask = np.ones((self.resolution, self.resolution))
-            mask = cv2.fillPoly(mask, pts=[points], color=(0, 0, 0))
-            mask = torch.from_numpy(mask)
-            mask = mask.unsqueeze(0)
-        elif self.mask == "half":
-            mask = torch.ones((self.resolution, self.resolution))
-            height = mask.shape[0]
-            mask[height // 2 :, :] = 0
-            mask = mask.unsqueeze(0)
-        elif self.mask == "eye":
-            mask = torch.ones((self.resolution, self.resolution))
-            landmark_coordinates = self.detect_facial_landmarks(image)
-            y = landmark_coordinates[195][1]
-            mask[y:, :] = 0
-            mask = mask.unsqueeze(0)
+        if mask_image is None:
+            self.mask_image = load_fixed_mask(resolution)
        else:
-            raise ValueError("Invalid mask type")
+            self.mask_image = mask_image

-        image = image.to(dtype=torch.float32)
-        pixel_values = self.normalize(image / 255.0)
-        masked_pixel_values = pixel_values * mask
-        mask = 1 - mask
-
-        return pixel_values, masked_pixel_values, mask
+        if device == "cpu":
+            self.face_detector = None
+        else:
+            self.face_detector = FaceDetector(device=device)

    def affine_transform(self, image: torch.Tensor) -> np.ndarray:
-        # image = rearrange(image, "c h w-> h w c").numpy()
        if self.face_detector is None:
-            landmark_coordinates = np.array(self.detect_facial_landmarks(image))
-            lm68 = mediapipe_lm478_to_face_alignment_lm68(landmark_coordinates)
-        else:
-            bbox, landmark_2d_106 = self.face_detector(image)
-            if bbox is None:
-                raise RuntimeError("Face not detected")
+            raise NotImplementedError("Using the CPU for face detection is not supported")
+        bbox, landmark_2d_106 = self.face_detector(image)
+        if bbox is None:
+            raise RuntimeError("Face not detected")

        pt_left_eye = np.mean(landmark_2d_106[[43, 48, 49, 51, 50]], axis=0)  # left eyebrow center
        pt_right_eye = np.mean(landmark_2d_106[101:106], axis=0)  # right eyebrow center
@@ -153,10 +94,8 @@ def prepare_masks_and_masked_images(self, images: Union[torch.Tensor, np.ndarray
            images = torch.from_numpy(images)
        if images.shape[3] == 3:
            images = rearrange(images, "f h w c -> f c h w")
-        if self.mask == "fix_mask":
-            results = [self.preprocess_fixed_mask_image(image, affine_transform=affine_transform) for image in images]
-        else:
-            results = [self.preprocess_one_masked_image(image) for image in images]
+
+        results = [self.preprocess_fixed_mask_image(image, affine_transform=affine_transform) for image in images]

        pixel_values_list, masked_pixel_values_list, masks_list = list(zip(*results))
        return torch.stack(pixel_values_list), torch.stack(masked_pixel_values_list), torch.stack(masks_list)
@@ -170,172 +109,24 @@ def process_images(self, images: Union[torch.Tensor, np.ndarray]):
        pixel_values = self.normalize(images / 255.0)
        return pixel_values

-    def close(self):
-        if self.face_mesh is not None:
-            self.face_mesh.close()
-
-
-def mediapipe_lm478_to_face_alignment_lm68(lm478, return_2d=True):
-    """
-    lm478: [B, 478, 3] or [478,3]
-    """
-    # lm478[..., 0] *= W
-    # lm478[..., 1] *= H
-    landmarks_extracted = []
-    for index in landmark_points_68:
-        x = lm478[index][0]
-        y = lm478[index][1]
-        landmarks_extracted.append((x, y))
-    return np.array(landmarks_extracted)
-

-landmark_points_68 = [
-    162, 234, 93, 58, 172, 136, 149, 148, 152, 377, 378, 365, 397, 288, 323, 454, 389,
-    71, 63, 105, 66, 107, 336, 296, 334, 293, 301, 168, 197, 5, 4, 75, 97, 2, 326, 305,
-    33, 160, 158, 133, 153, 144, 362, 385, 387, 263, 373, 380, 61, 39, 37, 0, 267, 269,
-    291, 405, 314, 17, 84, 181, 78, 82, 13, 312, 308, 317, 14, 87,
-]
+class VideoProcessor:
+    def __init__(self, resolution: int = 512, device: str = "cpu"):
+        self.image_processor = ImageProcessor(resolution, device)

+    def affine_transform_video(self, video_path):
+        video_frames = read_video(video_path, change_fps=False)
+        results = []
+        for frame in video_frames:
+            frame, _, _ = self.image_processor.affine_transform(frame)
+            results.append(frame)
+        results = torch.stack(results)

-# Refer to https://storage.googleapis.com/mediapipe-assets/documentation/mediapipe_face_landmark_fullsize.png
-mouth_surround_landmarks = [
-    164, 165, 167, 92, 186, 57, 43, 106, 182, 83, 18, 313, 406, 335,
-    273, 287, 410, 322, 391, 393,
-]
+        results = rearrange(results, "f c h w -> f h w c").numpy()
+        return results

-face_surround_landmarks = [
-    152, 377, 400, 378, 379, 365, 397, 288, 435, 433, 411, 425, 423, 327, 326, 94, 97, 98,
-    203, 205, 187, 213, 215, 58, 172, 136, 150, 149, 176, 148,
-]

if __name__ == "__main__":
-    image_processor = ImageProcessor(512, mask="fix_mask")
-    video = cv2.VideoCapture("assets/demo1_video.mp4")
-    while True:
-        ret, frame = video.read()
-        # if not ret:
-        #     break
-
-        # cv2.imwrite("image.jpg", frame)
-
-        frame = rearrange(torch.Tensor(frame).type(torch.uint8), "h w c -> c h w")
-        # face, masked_face, _ = image_processor.preprocess_fixed_mask_image(frame, affine_transform=True)
-        face, _, _ = image_processor.affine_transform(frame)
-
-        break
-
-    face = (rearrange(face, "c h w -> h w c").detach().cpu().numpy()).astype(np.uint8)
-    cv2.imwrite("face.jpg", face)
-
-    # masked_face = (rearrange(masked_face, "c h w -> h w c").detach().cpu().numpy()).astype(np.uint8)
-    # cv2.imwrite("masked_face.jpg", masked_face)
+    video_processor = VideoProcessor(256, "cuda")
+    video_frames = video_processor.affine_transform_video("validation/flux.mp4")
+    write_video("output.mp4", video_frames, fps=25)