[bug fix] in Downloading dataset #21

Open · wants to merge 5 commits into master
82 changes: 82 additions & 0 deletions datasets/esd_speech.py
@@ -0,0 +1,82 @@
"""Data loader for the LJSpeech dataset. See: https://keithito.com/LJ-Speech-Dataset/"""
import os
import re
import unicodedata
import numpy as np
import pandas as pd

from torch.utils.data import Dataset

vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS.
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}


def text_normalize(text):
text = ''.join(char for char in unicodedata.normalize('NFD', text)
if unicodedata.category(char) != 'Mn') # Strip accents

text = text.lower()
text = re.sub("[^{}]".format(vocab), " ", text)
text = re.sub("[ ]+", " ", text)
return text


def read_metadata(csv_file, subset='*'):
df = pd.read_csv(csv_file)
if subset == '*':
subset_df = df
else:
subset_df = df[df['subset'] == subset]
fnames, text_lengths, texts = [], [], []

for index, row in subset_df.iterrows():
fnames.append(row['file'])
text = row['text']

text = text_normalize(text) + "E" # E: EOS
text = [char2idx[char] for char in text]
text_lengths.append(len(text))
        texts.append(np.array(text, np.int64))

return fnames, text_lengths, texts


def get_test_data(sentences, max_n):
normalized_sentences = [text_normalize(line).strip() + "E" for line in sentences] # text normalization, E: EOS
    texts = np.zeros((len(normalized_sentences), max_n + 1), np.int64)
for i, sent in enumerate(normalized_sentences):
texts[i, :len(sent)] = [char2idx[char] for char in sent]
return texts


class ESDSpeech(Dataset):
def __init__(self, keys, dir_name='EmotionalSpeechDataset'):
self.keys = keys
self.path = os.path.join(os.path.dirname(os.path.realpath(__file__)), dir_name)
self.fnames, self.text_lengths, self.texts = read_metadata(os.path.join(self.path, 'ESD.csv'))

def slice(self, start, end):
self.fnames = self.fnames[start:end]
self.text_lengths = self.text_lengths[start:end]
self.texts = self.texts[start:end]

def __len__(self):
return len(self.fnames)

def __getitem__(self, index):
data = {}
if 'texts' in self.keys:
data['texts'] = self.texts[index]
        if 'mels' in self.keys:
            # mel spectrogram, shape (T, n_mels), e.g. (39, 80)
            data['mels'] = np.load(os.path.join(self.path, 'mels', "%s.npy" % self.fnames[index]))
        if 'mags' in self.keys:
            # linear magnitude spectrogram, shape (T', 1 + n_fft // 2)
            data['mags'] = np.load(os.path.join(self.path, 'mags', "%s.npy" % self.fnames[index]))
        if 'mel_gates' in self.keys:
            data['mel_gates'] = np.ones(data['mels'].shape[0], dtype=np.int64)  # all ones: lengths already fixed in preprocessing
        if 'mag_gates' in self.keys:
            data['mag_gates'] = np.ones(data['mags'].shape[0], dtype=np.int64)  # all ones: lengths already fixed in preprocessing
return data
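A minimal smoke test for the new loader (hypothetical, not part of this PR; it assumes datasets/EmotionalSpeechDataset already contains ESD.csv plus the mels/ and mags/ .npy files produced by audio.preprocess):

```python
from datasets.esd_speech import ESDSpeech

# Request only the keys needed for Text2Mel training.
ds = ESDSpeech(['texts', 'mels', 'mel_gates'])
print(len(ds))                    # number of rows in ESD.csv
sample = ds[0]
print(sample['texts'].dtype)      # int64 character indices, EOS-terminated
print(sample['mels'].shape)       # (T, 80) mel spectrogram
print(sample['mel_gates'].shape)  # (T,) all-ones gate vector
```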
39 changes: 28 additions & 11 deletions dl_and_preprop_dataset.py
@@ -5,32 +5,34 @@
"""
__author__ = 'Erdene-Ochir Tuguldur'

-import os
-import sys
+import argparse
import csv
+import sys
import time
-import argparse
+from zipfile import ZipFile
+
import fnmatch
import librosa
+import os
import pandas as pd

-from hparams import HParams as hp
-from zipfile import ZipFile
from audio import preprocess
-from utils import download_file
-from datasets.mb_speech import MBSpeech
+from datasets.esd_speech import ESDSpeech
from datasets.lj_speech import LJSpeech
+from datasets.mb_speech import MBSpeech
+from hparams import HParams as hp
+from utils import download_file

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'esdspeech', 'mbspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
dataset_file_name = 'LJSpeech-1.1.tar.bz2'
datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
dataset_path = os.path.join(datasets_path, 'LJSpeech-1.1')

-    if os.path.isdir(dataset_path) and False:
+    if os.path.isdir(dataset_path):
print("LJSpeech dataset folder already exists")
sys.exit(0)
else:
@@ -48,12 +50,27 @@
print("pre processing...")
lj_speech = LJSpeech([])
preprocess(dataset_path, lj_speech)
+elif args.dataset == 'esdspeech':
+    datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
+    dataset_path = os.path.join(datasets_path, 'EmotionalSpeechDataset')
+
+    if os.path.isdir(dataset_path):
+        print("EmotionalSpeechDataset dataset folder already exists")
+    else:
+        print("please download the ESD dataset into %s first" % dataset_path)
+        sys.exit(1)
+
+    print("pre processing...")
+    esd_speech = ESDSpeech([])
+    preprocess(dataset_path, esd_speech)
elif args.dataset == 'mbspeech':
dataset_name = 'MBSpeech-1.0'
datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
dataset_path = os.path.join(datasets_path, dataset_name)

-    if os.path.isdir(dataset_path) and False:
+    if os.path.isdir(dataset_path):
print("MBSpeech dataset folder already exists")
sys.exit(0)
else:
@@ -159,4 +176,4 @@ def _convert_mp3_to_wav(book_name, book_nr):
# pre process
print("pre processing...")
mb_speech = MBSpeech([])
-    preprocess(dataset_path, mb_speech)
\ No newline at end of file
+    preprocess(dataset_path, mb_speech)
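The new esdspeech branch expects an ESD.csv metadata file (columns file, text, subset) inside datasets/EmotionalSpeechDataset, which read_metadata then consumes; the PR does not generate that file. A hypothetical helper along these lines could build it from the per-speaker transcript files shipped with ESD (the tab-separated transcript layout is an assumption about the raw download, not something this diff guarantees):

```python
# Hypothetical ESD.csv builder; assumes each speaker folder (0011, 0012, ...)
# ships a <speaker>.txt transcript with "file<TAB>text<TAB>emotion" rows.
import csv
import glob
import os

def build_esd_csv(dataset_path, subset='train'):
    rows = []
    for txt in sorted(glob.glob(os.path.join(dataset_path, '*', '*.txt'))):
        with open(txt, encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    rows.append({'file': parts[0], 'text': parts[1], 'subset': subset})
    with open(os.path.join(dataset_path, 'ESD.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['file', 'text', 'subset'])
        writer.writeheader()
        writer.writerows(rows)
```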
28 changes: 27 additions & 1 deletion synthesize.py
@@ -16,12 +16,38 @@
from utils import get_last_checkpoint_file_name, load_checkpoint, save_to_png

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import vocab, get_test_data

SENTENCES = [
"The birch canoe slid on the smooth planks.",
"Glue the sheet to the dark blue background.",
"It's easy to tell the depth of a well.",
"These days a chicken leg is a rare dish.",
"Rice is often served in round bowls.",
"The juice of lemons makes fine punch.",
"The box was thrown beside the parked truck.",
"The hogs were fed chopped corn and garbage.",
"Four hours of steady work faced us.",
"Large size in stockings is hard to sell.",
"The boy was there when the sun rose.",
"A rod is used to catch pink salmon.",
"The source of the huge river is the clear spring.",
"Kick the ball straight and follow through.",
"Help the woman get back to her feet.",
"A pot of tea helps to pass the evening.",
"Smoky fires lack flame and heat.",
"The soft cushion broke the man's fall.",
"The salt breeze came across from the sea.",
"The girl at the booth sold fifty bonds."
]

+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import vocab, get_test_data
+
+    SENTENCES = [
+        "The birch canoe slid on the smooth planks.",
+        "Glue the sheet to the dark blue background.",
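For reference, get_test_data turns these sentences into a fixed-width matrix of character indices that synthesize.py feeds to Text2Mel. A sketch of the call (hp.max_N is the maximum-text-length hyperparameter used elsewhere in this repo; treat the exact name as an assumption):

```python
import torch
from hparams import HParams as hp
from datasets.esd_speech import get_test_data

# Zero-padded (num_sentences, hp.max_N + 1) int64 matrix; index 0 is the padding symbol 'P'.
L = torch.from_numpy(get_test_data(SENTENCES, hp.max_N))
print(L.shape)
```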
13 changes: 9 additions & 4 deletions train-ssrn.py
@@ -18,23 +18,28 @@
from datasets.data_loader import SSRNDataLoader

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import LJSpeech as SpeechDataset
+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import ESDSpeech as SpeechDataset
else:
from datasets.mb_speech import MBSpeech as SpeechDataset

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)

+device = 'cpu'
if use_gpu:
+    device = 'cuda:0'
    torch.backends.cudnn.benchmark = True

train_data_loader = SSRNDataLoader(ssrn_dataset=SpeechDataset(['mags', 'mels']), batch_size=24, mode='train')
valid_data_loader = SSRNDataLoader(ssrn_dataset=SpeechDataset(['mags', 'mels']), batch_size=24, mode='valid')

-ssrn = SSRN().cuda()
+ssrn = SSRN().to(device)

optimizer = torch.optim.Adam(ssrn.parameters(), lr=hp.ssrn_lr)

@@ -81,8 +86,8 @@ def train(train_epoch, phase='train'):
S = S.permute(0, 2, 1) # TODO: because of pre processing

M.requires_grad = False
-        M = M.cuda()
-        S = S.cuda()
+        M = M.to(device)
+        S = S.to(device)

Z_logit, Z = ssrn(S)

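The recurring change in this PR swaps hard-coded .cuda() calls for .to(device), which keeps the training scripts runnable on CPU-only machines. The pattern in isolation (an illustrative sketch, not lifted verbatim from the diff):

```python
import torch

# Pick the device once, then move the model and every batch tensor to it.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

model = torch.nn.Linear(80, 80).to(device)  # stand-in for SSRN / Text2Mel
batch = torch.randn(4, 80).to(device)       # stand-in for a training batch
print(model(batch).device)
```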
21 changes: 14 additions & 7 deletions train-text2mel.py
@@ -20,25 +20,31 @@
from datasets.data_loader import Text2MelDataLoader

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import vocab, LJSpeech as SpeechDataset
+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import vocab, ESDSpeech as SpeechDataset
else:
from datasets.mb_speech import vocab, MBSpeech as SpeechDataset

use_gpu = torch.cuda.is_available()
# use_gpu = False
print('use_gpu', use_gpu)

+device = 'cpu'
if use_gpu:
+    device = 'cuda:1'
    torch.backends.cudnn.benchmark = True

train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(['texts', 'mels', 'mel_gates']), batch_size=64,
mode='train')
valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(['texts', 'mels', 'mel_gates']), batch_size=64,
mode='valid')

-text2mel = Text2Mel(vocab).cuda()
+text2mel = Text2Mel(vocab).to(device)

optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)

@@ -102,11 +108,12 @@ def W_nt(_, n, t, g=0.2):
W = np.fromfunction(W_nt, (B, N, T), dtype=np.float32)
W = torch.from_numpy(W)

-        L = L.cuda()
-        S = S.cuda()
-        S_shifted = S_shifted.cuda()
-        W = W.cuda()
-        gates = gates.cuda()
+        L = L.to(device)
+        S = S.to(device)
+        S_shifted = S_shifted.to(device)
+        W = W.to(device)
+        gates = gates.to(device)

Y_logit, Y, A = text2mel(L, S)

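One caveat for reviewers: this script pins the model to 'cuda:1' while train-ssrn.py uses 'cuda:0', so train-text2mel.py will crash on single-GPU machines. A defensive variant of the device selection (a suggestion, not part of this diff):

```python
import torch

# Use the second GPU when present, otherwise fall back gracefully.
if torch.cuda.is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'
```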