antgroup · CodeLyokoscj · Aug 5, 2024 · Aug 5, 2024
diff --git a/src/models/mutual_self_attention.py b/src/models/mutual_self_attention.py
@@ -80,13 +80,11 @@ def register_reference_hooks(
                     [1] * batch_size * num_images_per_prompt * 16
                     + [0] * batch_size * num_images_per_prompt * 16
                 )
-                .to(device)
                 .bool()
             )
         else:
             uc_mask = (
                 torch.Tensor([0] * batch_size * num_images_per_prompt * 2)
-                .to(device)
                 .bool()
             )
 
@@ -170,7 +168,6 @@ def hacked_basic_transformer_inner_forward(
                                     [1] * (hidden_states.shape[0] // 2)
                                     + [0] * (hidden_states.shape[0] // 2)
                                 )
-                                .to(device)
                                 .bool()
                             )
                         hidden_states_c[_uc_mask] = (

diff --git a/webgui.py b/webgui.py
@@ -67,7 +67,7 @@
 
 ############# model_init started #############
 ## vae init
-vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
+vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)
 
 ## reference net init
 reference_unet = UNet2DConditionModel.from_pretrained(
@@ -101,7 +101,7 @@
 denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)
 
 ## face locator init
-face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")
+face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device=device)
 face_locator.load_state_dict(torch.load(config.face_locator_path))
 
 ## load audio processor params
@@ -122,7 +122,7 @@
     audio_guider=audio_processor,
     face_locator=face_locator,
     scheduler=scheduler,
-).to("cuda", dtype=weight_dtype)
+).to(device, dtype=weight_dtype)
 
 def select_face(det_bboxes, probs):
     ## max face from faces that the prob is above 0.8
@@ -170,7 +170,7 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
         face_mask = cv2.resize(face_mask, (width, height))
 
     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-    face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+    face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
 
     video = pipe(
         ref_image_pil,