2noise
diff --git a/‎.github/workflows/checksum.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/checksum.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/close-issue.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/close-issue.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/pull-format.yml
Lines changed: 9 additions & 1 deletion b/‎.github/workflows/pull-format.yml
Lines changed: 9 additions & 1 deletion
diff --git a/‎.github/workflows/push-format.yml
Lines changed: 8 additions & 0 deletions b/‎.github/workflows/push-format.yml
Lines changed: 8 additions & 0 deletions
diff --git a/‎.github/workflows/unitest.yml
Lines changed: 8 additions & 0 deletions b/‎.github/workflows/unitest.yml
Lines changed: 8 additions & 0 deletions
diff --git a/‎ChatTTS/config/config.py
Lines changed: 3 additions & 3 deletions b/‎ChatTTS/config/config.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎ChatTTS/core.py
Lines changed: 28 additions & 33 deletions b/‎ChatTTS/core.py
Lines changed: 28 additions & 33 deletions
diff --git a/‎ChatTTS/model/dvae.py
Lines changed: 11 additions & 4 deletions b/‎ChatTTS/model/dvae.py
Lines changed: 11 additions & 4 deletions
diff --git a/‎ChatTTS/model/embed.py
Lines changed: 4 additions & 6 deletions b/‎ChatTTS/model/embed.py
Lines changed: 4 additions & 6 deletions
diff --git a/‎ChatTTS/model/gpt.py
Lines changed: 1 addition & 1 deletion b/‎ChatTTS/model/gpt.py
Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ on:
 
 jobs:
   checksum:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@v4
 
@@ -13,7 +13,7 @@ jobs:
 
       - name: Run RVC-Models-Downloader
         run: |
-          wget https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.8/rvcmd_linux_amd64.deb
+          wget https://github.com/fumiama/RVC-Models-Downloader/releases/download/v0.2.9/rvcmd_linux_amd64.deb
           sudo apt -y install ./rvcmd_linux_amd64.deb
           rm -f ./rvcmd_linux_amd64.deb
           rvcmd -notrs -w 1 -notui assets/chtts
 
@@ -5,14 +5,14 @@ on:
 
 jobs:
   close-issues:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     permissions:
       issues: write
       pull-requests: write
     steps:
       - uses: actions/stale@v5
         with:
-          exempt-issue-labels: "help wanted,good first issue,documentation,following up,todo list"
+          exempt-issue-labels: "help wanted,following up,todo list,enhancement,algorithm,delayed,performance"
           days-before-issue-stale: 30
           days-before-issue-close: 15
           stale-issue-label: "stale"
 
@@ -8,7 +8,7 @@ jobs:
   # This workflow closes invalid PR
   change-or-close-pr:
     # The type of runner that the job will run on
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     permissions: write-all
 
     # Steps represent a sequence of tasks that will be executed as part of the job
@@ -63,6 +63,14 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
 
+      - name: Create venv
+        run: python3 -m venv .venv
+
+      - name: Activate venv
+        run: |
+          . .venv/bin/activate
+          echo PATH=$PATH >> $GITHUB_ENV
+
       - name: Install Black
         run: pip install "black[jupyter]"
 
 
@@ -24,6 +24,14 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
 
+      - name: Create venv
+        run: python3 -m venv .venv
+
+      - name: Activate venv
+        run: |
+          . .venv/bin/activate
+          echo PATH=$PATH >> $GITHUB_ENV
+
       - name: Install Black
         run: pip install "black[jupyter]"
 
 
@@ -25,6 +25,14 @@ jobs:
         run: |
           sudo apt-get install -y portaudio19-dev python3-pyaudio
 
+      - name: Create venv
+        run: python3 -m venv .venv
+
+      - name: Activate venv
+        run: |
+          . .venv/bin/activate
+          echo PATH=$PATH >> $GITHUB_ENV
+
       - name: Test Install
         run: pip install .
 
 
@@ -3,10 +3,10 @@
 
 @dataclass(repr=False, eq=False)
 class Path:
-    vocos_ckpt_path: str = "asset/Vocos.pt"
-    dvae_ckpt_path: str = "asset/DVAE_full.pt"
+    vocos_ckpt_path: str = "asset/Vocos.safetensors"
+    dvae_ckpt_path: str = "asset/DVAE.safetensors"
     gpt_ckpt_path: str = "asset/gpt"
-    decoder_ckpt_path: str = "asset/Decoder.pt"
+    decoder_ckpt_path: str = "asset/Decoder.safetensors"
     tokenizer_path: str = "asset/tokenizer"
     embed_path: str = "asset/Embed.safetensors"
 
 
@@ -15,6 +15,7 @@
 from .config import Config
 from .model import DVAE, Embed, GPT, gen_logits, Tokenizer, Speaker
 from .utils import (
+    load_safetensors,
     check_all_assets,
     download_all_assets,
     select_device,
@@ -97,7 +98,7 @@ def download_models(
                 try:
                     download_path = snapshot_download(
                         repo_id="2Noise/ChatTTS",
-                        allow_patterns=["*.pt", "*.yaml", "*.json", "*.safetensors"],
+                        allow_patterns=["*.yaml", "*.json", "*.safetensors"],
                     )
                 except:
                     download_path = None
@@ -253,34 +254,34 @@ def _load(
         vocos = (
             Vocos(feature_extractor=feature_extractor, backbone=backbone, head=head)
             .to(
-                # vocos on mps will crash, use cpu fallback
+                # Vocos on mps will crash, use cpu fallback.
+                # Also, the complex dtype used by the Vocos decode step is not yet supported by torch_npu,
+                # so this computation runs on the CPU instead of the NPU.
                 "cpu"
-                if "mps" in str(device)
+                if "mps" in str(device) or "npu" in str(device)
                 else device
             )
             .eval()
         )
         assert vocos_ckpt_path, "vocos_ckpt_path should not be None"
-        vocos.load_state_dict(torch.load(vocos_ckpt_path, weights_only=True, mmap=True))
+        vocos.load_state_dict(load_safetensors(vocos_ckpt_path))
         self.vocos = vocos
         self.logger.log(logging.INFO, "vocos loaded.")
 
-        dvae = (
-            DVAE(
-                decoder_config=asdict(self.config.dvae.decoder),
-                encoder_config=asdict(self.config.dvae.encoder),
-                vq_config=asdict(self.config.dvae.vq),
-                dim=self.config.dvae.decoder.idim,
-                coef=coef,
-                device=device,
-            )
-            .to(device)
-            .eval()
+        # Computation of MelSpectrogram on npu is not supported yet, so fall back to cpu.
+        dvae_device = torch.device("cpu") if "npu" in str(self.device) else device
+        dvae = DVAE(
+            decoder_config=asdict(self.config.dvae.decoder),
+            encoder_config=asdict(self.config.dvae.encoder),
+            vq_config=asdict(self.config.dvae.vq),
+            dim=self.config.dvae.decoder.idim,
+            coef=coef,
+            device=dvae_device,
         )
         coef = str(dvae)
         assert dvae_ckpt_path, "dvae_ckpt_path should not be None"
-        dvae.load_state_dict(torch.load(dvae_ckpt_path, weights_only=True, mmap=True))
-        self.dvae = dvae
+        dvae.load_pretrained(dvae_ckpt_path, dvae_device)
+        self.dvae = dvae.eval()
         self.logger.log(logging.INFO, "dvae loaded.")
 
         embed = Embed(
@@ -289,7 +290,7 @@ def _load(
             self.config.embed.num_text_tokens,
             self.config.embed.num_vq,
         )
-        embed.from_pretrained(embed_path, device=device)
+        embed.load_pretrained(embed_path, device=device)
         self.embed = embed.to(device)
         self.logger.log(logging.INFO, "embed loaded.")
 
@@ -303,7 +304,7 @@ def _load(
             logger=self.logger,
         ).eval()
         assert gpt_ckpt_path, "gpt_ckpt_path should not be None"
-        gpt.from_pretrained(gpt_ckpt_path, embed_path, experimental=experimental)
+        gpt.load_pretrained(gpt_ckpt_path, embed_path, experimental=experimental)
         gpt.prepare(compile=compile and "cuda" in str(device))
         self.gpt = gpt
         self.logger.log(logging.INFO, "gpt loaded.")
@@ -313,22 +314,16 @@ def _load(
         )
         self.logger.log(logging.INFO, "speaker loaded.")
 
-        decoder = (
-            DVAE(
-                decoder_config=asdict(self.config.decoder),
-                dim=self.config.decoder.idim,
-                coef=coef,
-                device=device,
-            )
-            .to(device)
-            .eval()
+        decoder = DVAE(
+            decoder_config=asdict(self.config.decoder),
+            dim=self.config.decoder.idim,
+            coef=coef,
+            device=device,
         )
         coef = str(decoder)
         assert decoder_ckpt_path, "decoder_ckpt_path should not be None"
-        decoder.load_state_dict(
-            torch.load(decoder_ckpt_path, weights_only=True, mmap=True)
-        )
-        self.decoder = decoder
+        decoder.load_pretrained(decoder_ckpt_path, device)
+        self.decoder = decoder.eval()
         self.logger.log(logging.INFO, "decoder loaded.")
 
         if tokenizer_path:
@@ -422,7 +417,7 @@ def _infer(
 
     @torch.inference_mode()
     def _vocos_decode(self, spec: torch.Tensor) -> np.ndarray:
-        if "mps" in str(self.device):
+        if "mps" in str(self.device) or "npu" in str(self.device):
             return self.vocos.decode(spec.cpu()).cpu().numpy()
         else:
             return self.vocos.decode(spec).cpu().numpy()
 
@@ -5,10 +5,11 @@
 import pybase16384 as b14
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 import torchaudio
 from vector_quantize_pytorch import GroupedResidualFSQ
 
+from ..utils import load_safetensors
+
 
 class ConvNeXtBlock(nn.Module):
     def __init__(
@@ -36,7 +37,7 @@ def __init__(
         )  # pointwise/1x1 convs, implemented with linear layers
         self.act = nn.GELU()
         self.pwconv2 = nn.Linear(intermediate_dim, dim)
-        self.gamma = (
+        self.weight = (
             nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
             if layer_scale_init_value > 0
             else None
@@ -55,8 +56,8 @@ def forward(self, x: torch.Tensor, cond=None) -> torch.Tensor:
         del y
         y = self.pwconv2(x)
         del x
-        if self.gamma is not None:
-            y *= self.gamma
+        if self.weight is not None:
+            y *= self.weight
         y.transpose_(1, 2)  # (B, T, C) -> (B, C, T)
 
         x = y + residual
@@ -251,6 +252,12 @@ def __call__(
     ) -> torch.Tensor:
         return super().__call__(inp, mode)
 
+    @torch.inference_mode()
+    def load_pretrained(self, filename: str, device: torch.device):
+        state_dict_tensors = load_safetensors(filename)
+        self.load_state_dict(state_dict_tensors)
+        self.to(device)
+
     @torch.inference_mode()
     def forward(
         self, inp: torch.Tensor, mode: Literal["encode", "decode"] = "decode"
 
@@ -1,8 +1,9 @@
-from safetensors.torch import safe_open
 import torch
 import torch.nn as nn
 from torch.nn.utils.parametrizations import weight_norm
 
+from ..utils import load_safetensors
+
 
 class Embed(nn.Module):
     def __init__(
@@ -34,11 +35,8 @@ def __init__(
         )
 
     @torch.inference_mode()
-    def from_pretrained(self, filename: str, device: torch.device):
-        state_dict_tensors = {}
-        with safe_open(filename, framework="pt") as f:
-            for k in f.keys():
-                state_dict_tensors[k] = f.get_tensor(k)
+    def load_pretrained(self, filename: str, device: torch.device):
+        state_dict_tensors = load_safetensors(filename)
         self.load_state_dict(state_dict_tensors)
         self.to(device)
 
 
@@ -56,7 +56,7 @@ def __init__(
         self.head_text = embed.head_text.__call__
         self.head_code = [hc.__call__ for hc in embed.head_code]
 
-    def from_pretrained(
+    def load_pretrained(
         self, gpt_folder: str, embed_file_path: str, experimental=False
     ):
         if self.is_vllm and platform.system().lower() == "linux":