
Commit f5537dc

Author: Eren G (committed)
Message: pep8 format all
1 parent 3238ffa · commit f5537dc

32 files changed: +770 additions, −603 deletions
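
Note: the new layout throughout the diff — one argument per line, continuation lines aligned to the opening parenthesis — matches what an automatic PEP 8 formatter such as yapf produces (e.g. `yapf --in-place --recursive .`). The commit message only says "pep8 format all", so the exact tool is an assumption.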

datasets/Kusal.py

Lines changed: 26 additions & 21 deletions
@@ -8,21 +8,28 @@
 from torch.utils.data import Dataset
 
 from utils.text import text_to_sequence
-from utils.data import (prepare_data, pad_per_step,
-                        prepare_tensor, prepare_stop_target)
+from utils.data import (prepare_data, pad_per_step, prepare_tensor,
+                        prepare_stop_target)
 
 
 class MyDataset(Dataset):
-
-    def __init__(self, root_dir, csv_file, outputs_per_step,
-                 text_cleaner, ap, min_seq_len=0):
+    def __init__(self,
+                 root_dir,
+                 csv_file,
+                 outputs_per_step,
+                 text_cleaner,
+                 ap,
+                 min_seq_len=0):
         self.root_dir = root_dir
         self.wav_dir = os.path.join(root_dir, 'wav')
         self.wav_files = glob.glob(os.path.join(self.wav_dir, '*.wav'))
         self._create_file_dict()
         self.csv_dir = os.path.join(root_dir, csv_file)
         with open(self.csv_dir, "r", encoding="utf8") as f:
-            self.frames = [line.split('\t') for line in f if line.split('\t')[0] in self.wav_files_dict.keys()]
+            self.frames = [
+                line.split('\t') for line in f
+                if line.split('\t')[0] in self.wav_files_dict.keys()
+            ]
         self.outputs_per_step = outputs_per_step
         self.sample_rate = ap.sample_rate
         self.cleaners = text_cleaner
@@ -43,10 +50,8 @@ def load_wav(self, filename):
         print(" !! Cannot read file : {}".format(filename))
 
     def _trim_silence(self, wav):
-        return librosa.effects.trim(
-            wav, top_db=40,
-            frame_length=1024,
-            hop_length=256)[0]
+        return librosa.effects.trim(
+            wav, top_db=40, frame_length=1024, hop_length=256)[0]
 
     def _create_file_dict(self):
         self.wav_files_dict = {}
@@ -87,11 +92,10 @@ def __getitem__(self, idx):
         sidx = self.frames[idx][0]
         sidx_files = self.wav_files_dict[sidx]
         file_name = random.choice(sidx_files)
-        wav_name = os.path.join(self.wav_dir,
-                                file_name)
+        wav_name = os.path.join(self.wav_dir, file_name)
         text = self.frames[idx][2]
-        text = np.asarray(text_to_sequence(
-            text, [self.cleaners]), dtype=np.int32)
+        text = np.asarray(
+            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
         wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
         return sample
@@ -121,12 +125,13 @@ def collate_fn(self, batch):
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*(mel_len-1))
-                            for mel_len in mel_lengths]
+            stop_targets = [
+                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
+            ]
 
             # PAD stop targets
-            stop_targets = prepare_stop_target(
-                stop_targets, self.outputs_per_step)
+            stop_targets = prepare_stop_target(stop_targets,
+                                               self.outputs_per_step)
 
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -150,8 +155,8 @@ def collate_fn(self, batch):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
 
-            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]
+            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[
+                0]
 
         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
-                         found {}"
-                         .format(type(batch[0]))))
+                         found {}".format(type(batch[0]))))
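
Note: the reformatted stop-target code above is behavior-preserving; each target is a run of 0.0 ("keep decoding") one frame shorter than its mel spectrogram. A minimal sketch of the construction and padding, with a hypothetical pad_stop_target standing in for utils.data.prepare_stop_target, whose implementation is not part of this diff (padding with 1.0 as the stop value is an assumption):

import numpy as np

def pad_stop_target(targets, outputs_per_step):
    # Hypothetical stand-in for utils.data.prepare_stop_target: right-pad every
    # target with 1.0 (assumed "stop" value) to the batch maximum, rounded up
    # to a multiple of outputs_per_step.
    max_len = max(len(t) for t in targets)
    pad_len = -(-max_len // outputs_per_step) * outputs_per_step  # ceiling
    return np.stack([np.pad(t, (0, pad_len - len(t)), constant_values=1.0)
                     for t in targets])

mel_lengths = [4, 6, 5]  # toy frame counts; each already includes the zero-frame
stop_targets = [np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths]
print(pad_stop_target(stop_targets, outputs_per_step=2).shape)  # (3, 6)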

datasets/LJSpeech.py

Lines changed: 20 additions & 16 deletions
@@ -6,14 +6,18 @@
 from torch.utils.data import Dataset
 
 from utils.text import text_to_sequence
-from utils.data import (prepare_data, pad_per_step,
-                        prepare_tensor, prepare_stop_target)
+from utils.data import (prepare_data, pad_per_step, prepare_tensor,
+                        prepare_stop_target)
 
 
 class MyDataset(Dataset):
-
-    def __init__(self, root_dir, csv_file, outputs_per_step,
-                 text_cleaner, ap, min_seq_len=0):
+    def __init__(self,
+                 root_dir,
+                 csv_file,
+                 outputs_per_step,
+                 text_cleaner,
+                 ap,
+                 min_seq_len=0):
         self.root_dir = root_dir
         self.wav_dir = os.path.join(root_dir, 'wavs')
         self.csv_dir = os.path.join(root_dir, csv_file)
@@ -60,11 +64,10 @@ def __len__(self):
         return len(self.frames)
 
     def __getitem__(self, idx):
-        wav_name = os.path.join(self.wav_dir,
-                                self.frames[idx][0]) + '.wav'
+        wav_name = os.path.join(self.wav_dir, self.frames[idx][0]) + '.wav'
         text = self.frames[idx][1]
-        text = np.asarray(text_to_sequence(
-            text, [self.cleaners]), dtype=np.int32)
+        text = np.asarray(
+            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
         wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
         return sample
@@ -94,12 +97,13 @@ def collate_fn(self, batch):
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*(mel_len-1))
-                            for mel_len in mel_lengths]
+            stop_targets = [
+                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
+            ]
 
             # PAD stop targets
-            stop_targets = prepare_stop_target(
-                stop_targets, self.outputs_per_step)
+            stop_targets = prepare_stop_target(stop_targets,
+                                               self.outputs_per_step)
 
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -123,8 +127,8 @@ def collate_fn(self, batch):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
 
-            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]
+            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[
+                0]
 
         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
-                         found {}"
-                         .format(type(batch[0]))))
+                         found {}".format(type(batch[0]))))
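
Note: MyDataset ships its own collate_fn, so the expected wiring is the standard PyTorch pattern of handing it to DataLoader. A hedged sketch — the constructor values and the 'metadata.csv' name are illustrative, the AudioProcessor argument order is taken from TWEB.py's diff below, and r=5, "english_cleaners", and num_loader_workers=16 mirror debug_config.py:

from torch.utils.data import DataLoader
from datasets.LJSpeech import MyDataset
from TTS.utils.audio import AudioProcessor  # import path as used by TWEB.py

# Illustrative audio settings; only the positional order comes from this commit.
ap = AudioProcessor(22050, 80, -100, 12.5, 50, 0.97, 20, 1025, 1.5)
dataset = MyDataset(root_dir='/data/shared/KeithIto/LJSpeech-1.0',
                    csv_file='metadata.csv',  # assumed file name
                    outputs_per_step=5,
                    text_cleaner='english_cleaners',
                    ap=ap,
                    min_seq_len=0)
loader = DataLoader(dataset, batch_size=32, num_workers=16,
                    shuffle=True, collate_fn=dataset.collate_fn)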

datasets/LJSpeechCached.py

Lines changed: 28 additions & 19 deletions
@@ -6,14 +6,18 @@
 from torch.utils.data import Dataset
 
 from utils.text import text_to_sequence
-from utils.data import (prepare_data, pad_per_step,
-                        prepare_tensor, prepare_stop_target)
+from utils.data import (prepare_data, pad_per_step, prepare_tensor,
+                        prepare_stop_target)
 
 
 class MyDataset(Dataset):
-
-    def __init__(self, root_dir, csv_file, outputs_per_step,
-                 text_cleaner, ap, min_seq_len=0):
+    def __init__(self,
+                 root_dir,
+                 csv_file,
+                 outputs_per_step,
+                 text_cleaner,
+                 ap,
+                 min_seq_len=0):
         self.root_dir = root_dir
         self.wav_dir = os.path.join(root_dir, 'wavs')
         self.feat_dir = os.path.join(root_dir, 'loader_data')
@@ -35,7 +39,7 @@ def load_wav(self, filename):
             return audio
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
-
+
     def load_np(self, filename):
         data = np.load(filename).astype('float32')
         return data
@@ -66,20 +70,24 @@ def __len__(self):
 
     def __getitem__(self, idx):
         if self.items[idx] is None:
-            wav_name = os.path.join(self.wav_dir,
-                                    self.frames[idx][0]) + '.wav'
+            wav_name = os.path.join(self.wav_dir, self.frames[idx][0]) + '.wav'
             mel_name = os.path.join(self.feat_dir,
                                     self.frames[idx][0]) + '.mel.npy'
             linear_name = os.path.join(self.feat_dir,
                                        self.frames[idx][0]) + '.linear.npy'
             text = self.frames[idx][1]
-            text = np.asarray(text_to_sequence(
-                text, [self.cleaners]), dtype=np.int32)
+            text = np.asarray(
+                text_to_sequence(text, [self.cleaners]), dtype=np.int32)
             wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
             mel = self.load_np(mel_name)
             linear = self.load_np(linear_name)
-            sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0],
-                      'mel':mel, 'linear': linear}
+            sample = {
+                'text': text,
+                'wav': wav,
+                'item_idx': self.frames[idx][0],
+                'mel': mel,
+                'linear': linear
+            }
             self.items[idx] = sample
         else:
             sample = self.items[idx]
@@ -109,12 +117,13 @@ def collate_fn(self, batch):
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*(mel_len-1))
-                            for mel_len in mel_lengths]
+            stop_targets = [
+                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
+            ]
 
             # PAD stop targets
-            stop_targets = prepare_stop_target(
-                stop_targets, self.outputs_per_step)
+            stop_targets = prepare_stop_target(stop_targets,
+                                               self.outputs_per_step)
 
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -138,8 +147,8 @@ def collate_fn(self, batch):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
 
-            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]
+            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[
+                0]
 
         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
-                         found {}"
-                         .format(type(batch[0]))))
+                         found {}".format(type(batch[0]))))
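
Note: LJSpeechCached.py differs from LJSpeech.py mainly in the self.items[idx] memoization visible above: wav/mel/linear features are read from disk on first access and served from memory afterwards. Stripped of the audio specifics, the pattern is:

class CachedDataset:
    """Minimal sketch of the memoization pattern in LJSpeechCached.MyDataset."""

    def __init__(self, n_items):
        self.items = [None] * n_items  # one slot per sample

    def __getitem__(self, idx):
        if self.items[idx] is None:           # first access: expensive disk read
            self.items[idx] = self._load(idx)
        return self.items[idx]                # later accesses hit the cache

    def _load(self, idx):
        return {'item_idx': idx}  # placeholder for the wav/mel/linear loading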

datasets/TWEB.py

Lines changed: 30 additions & 19 deletions
@@ -7,15 +7,25 @@
 
 from TTS.utils.text import text_to_sequence
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.data import (prepare_data, pad_per_step,
-                            prepare_tensor, prepare_stop_target)
+from TTS.utils.data import (prepare_data, pad_per_step, prepare_tensor,
+                            prepare_stop_target)
 
 
 class TWEBDataset(Dataset):
-
-    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
-                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
-                 frame_length_ms, preemphasis, ref_level_db, num_freq, power,
+    def __init__(self,
+                 csv_file,
+                 root_dir,
+                 outputs_per_step,
+                 sample_rate,
+                 text_cleaner,
+                 num_mels,
+                 min_level_db,
+                 frame_shift_ms,
+                 frame_length_ms,
+                 preemphasis,
+                 ref_level_db,
+                 num_freq,
+                 power,
                  min_seq_len=0):
 
         with open(csv_file, "r") as f:
@@ -25,8 +35,9 @@ def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
         self.sample_rate = sample_rate
         self.cleaners = text_cleaner
         self.min_seq_len = min_seq_len
-        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms,
-                                 frame_length_ms, preemphasis, ref_level_db, num_freq, power)
+        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
+                                 frame_shift_ms, frame_length_ms, preemphasis,
+                                 ref_level_db, num_freq, power)
         print(" > Reading TWEB from - {}".format(root_dir))
         print(" | > Number of instances : {}".format(len(self.frames)))
         self._sort_frames()
@@ -63,11 +74,10 @@ def __len__(self):
         return len(self.frames)
 
     def __getitem__(self, idx):
-        wav_name = os.path.join(self.root_dir,
-                                self.frames[idx][0]) + '.wav'
+        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
         text = self.frames[idx][1]
-        text = np.asarray(text_to_sequence(
-            text, [self.cleaners]), dtype=np.int32)
+        text = np.asarray(
+            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
         wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
         sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
         return sample
@@ -97,12 +107,13 @@ def collate_fn(self, batch):
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*(mel_len-1))
-                            for mel_len in mel_lengths]
+            stop_targets = [
+                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
+            ]
 
             # PAD stop targets
-            stop_targets = prepare_stop_target(
-                stop_targets, self.outputs_per_step)
+            stop_targets = prepare_stop_target(stop_targets,
+                                               self.outputs_per_step)
 
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -126,8 +137,8 @@ def collate_fn(self, batch):
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
 
-            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]
+            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[
+                0]
 
         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
-                         found {}"
-                         .format(type(batch[0]))))
+                         found {}".format(type(batch[0]))))
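
Note: every __getitem__ in this commit converts raw text with the same text_to_sequence(text, [cleaner_name]) call. A small usage sketch; "english_cleaners" comes from debug_config.py below, the import path matches TWEB.py (the other datasets import from utils.text), and the sample sentence is arbitrary:

import numpy as np
from TTS.utils.text import text_to_sequence

seq = text_to_sequence("Hello there, this is a test.", ["english_cleaners"])
text = np.asarray(seq, dtype=np.int32)  # integer symbol ids, ready for batching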

debug_config.py

Lines changed: 0 additions & 3 deletions
@@ -10,7 +10,6 @@
     "hidden_size": 128,
     "embedding_size": 256,
     "text_cleaner": "english_cleaners",
-
     "epochs": 200,
     "lr": 0.01,
     "lr_patience": 2,
@@ -19,9 +18,7 @@
     "griffinf_lim_iters": 60,
     "power": 1.5,
     "r": 5,
-
     "num_loader_workers": 16,
-
     "save_step": 1,
     "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
     "output_path": "result",
