From 90601b82b194f59d044875553ae381cddfbff5f7 Mon Sep 17 00:00:00 2001
From: Fate_nihility <61657922+CodeLyokoscj@users.noreply.github.com>
Date: Mon, 5 Aug 2024 12:12:05 +0800
Subject: [PATCH 1/2] Update webgui.py for gpu settings

Replace the hard-coded "cuda" with device to provide a unified format for
changing GPU settings.
---
 webgui.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/webgui.py b/webgui.py
index 5e46781..b12eeb4 100644
--- a/webgui.py
+++ b/webgui.py
@@ -67,7 +67,7 @@
 ############# model_init started #############
 
 ## vae init
-vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
+vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to(device, dtype=weight_dtype)
 
 ## reference net init
 reference_unet = UNet2DConditionModel.from_pretrained(
@@ -101,7 +101,7 @@
 denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)
 
 ## face locator init
-face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")
+face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device=device)
 face_locator.load_state_dict(torch.load(config.face_locator_path))
 
 ## load audio processor params
@@ -122,7 +122,7 @@
     audio_guider=audio_processor,
     face_locator=face_locator,
     scheduler=scheduler,
-).to("cuda", dtype=weight_dtype)
+).to(device, dtype=weight_dtype)
 
 def select_face(det_bboxes, probs):
     ## max face from faces that the prob is above 0.8
@@ -170,7 +170,7 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
         face_mask = cv2.resize(face_mask, (width, height))
 
         ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-        face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+        face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device=device).unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
 
         video = pipe(
             ref_image_pil,

From a4e9973dd0fa3d59ffe3b9cc34885fe60d229501 Mon Sep 17 00:00:00 2001
From: Fate_nihility <61657922+CodeLyokoscj@users.noreply.github.com>
Date: Mon, 5 Aug 2024 12:18:13 +0800
Subject: [PATCH 2/2] Update mutual_self_attention.py for gpu settings

The GPU settings have already been written in webgui.py:

pipe = Audio2VideoPipeline(
    vae=vae,
    reference_unet=reference_unet,
    denoising_unet=denoising_unet,
    audio_guider=audio_processor,
    face_locator=face_locator,
    scheduler=scheduler,
).to(device, dtype=weight_dtype)

Thus, the redundant per-tensor device settings in mutual_self_attention.py
are not needed (and may cause index errors on the tensors).
---
 src/models/mutual_self_attention.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/models/mutual_self_attention.py b/src/models/mutual_self_attention.py
index b5d4a7f..e3e57e9 100644
--- a/src/models/mutual_self_attention.py
+++ b/src/models/mutual_self_attention.py
@@ -80,13 +80,11 @@ def register_reference_hooks(
                     [1] * batch_size * num_images_per_prompt * 16
                     + [0] * batch_size * num_images_per_prompt * 16
                 )
-                .to(device)
                 .bool()
             )
         else:
             uc_mask = (
                 torch.Tensor([0] * batch_size * num_images_per_prompt * 2)
-                .to(device)
                 .bool()
             )
 
@@ -170,7 +168,6 @@ def hacked_basic_transformer_inner_forward(
                         [1] * (hidden_states.shape[0] // 2)
                         + [0] * (hidden_states.shape[0] // 2)
                     )
-                    .to(device)
                     .bool()
                 )
                 hidden_states_c[_uc_mask] = (
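
Note (not part of the patches): both commits assume that webgui.py defines a device variable before the models are loaded. A minimal sketch of what that selection could look like is shown below; the fallback logic and the weight_dtype choice are illustrative assumptions, not code taken from the repository.

```python
import torch

# Hypothetical device selection for webgui.py: prefer CUDA when available,
# otherwise fall back to CPU, so every subsequent
# .to(device, dtype=weight_dtype) call targets the same device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Illustrative dtype choice: half precision on GPU, full precision on CPU.
weight_dtype = torch.float16 if device == "cuda" else torch.float32
```

With device resolved once at startup, the whole pipeline is moved in one place (pipe = Audio2VideoPipeline(...).to(device, dtype=weight_dtype)), which is the setup the second commit relies on when it removes the per-tensor .to(device) calls in mutual_self_attention.py.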