feat: fine-tuning support for Speaker class (#679)

* Update speaker class
* add `inplace` argument
* update return type
* revert the naming
2noise · Aug 14, 2024 · d93ed8d · d93ed8d
1 parent 69aa900
commit d93ed8d
Showing 1 changed file with 12 additions and 7 deletions.
diff --git a/ChatTTS/model/speaker.py b/ChatTTS/model/speaker.py
@@ -22,16 +22,19 @@ def sample_random(self) -> str:
     def apply(
         self,
         emb: torch.Tensor,
-        spk_emb: str,
+        spk_emb: Union[str, torch.Tensor],
         input_ids: torch.Tensor,
         spk_emb_ids: int,
         device: torch.device,
-    ):
+        inplace: bool = True,
+    ) -> torch.Tensor:
+        if isinstance(spk_emb, str):
+            spk_emb_tensor = torch.from_numpy(self._decode(spk_emb))
+        else:
+            spk_emb_tensor = spk_emb
         n = (
             F.normalize(
-                torch.from_numpy(
-                    self._decode(spk_emb),
-                ),
+                spk_emb_tensor,
                 p=2.0,
                 dim=0,
                 eps=1e-12,
@@ -43,8 +46,10 @@ def apply(
             .expand(emb.shape)
         )
         cond = input_ids.narrow(-1, 0, 1).eq(spk_emb_ids).expand(emb.shape)
-        torch.where(cond, n, emb, out=emb)
-        del cond, n
+        out = torch.where(cond, n, emb, out=emb if inplace else None)
+        if inplace:
+            del cond, n
+        return out
 
     @staticmethod
     @torch.no_grad()