[bug fix] in Downloading dataset #21

Open · wants to merge 5 commits into master
82 changes: 82 additions & 0 deletions datasets/esd_speech.py
@@ -0,0 +1,82 @@
"""Data loader for the LJSpeech dataset. See: https://keithito.com/LJ-Speech-Dataset/"""
import os
import re
import unicodedata
import numpy as np
import pandas as pd

from torch.utils.data import Dataset

vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS.
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}


def text_normalize(text):
text = ''.join(char for char in unicodedata.normalize('NFD', text)
if unicodedata.category(char) != 'Mn') # Strip accents

text = text.lower()
text = re.sub("[^{}]".format(vocab), " ", text)
text = re.sub("[ ]+", " ", text)
return text


def read_metadata(csv_file, subset='*'):
df = pd.read_csv(csv_file)
if subset == '*':
subset_df = df
else:
subset_df = df[df['subset'] == subset]
fnames, text_lengths, texts = [], [], []

for index, row in subset_df.iterrows():
fnames.append(row['file'])
text = row['text']

text = text_normalize(text) + "E" # E: EOS
text = [char2idx[char] for char in text]
text_lengths.append(len(text))
        texts.append(np.array(text, np.int64))

return fnames, text_lengths, texts


def get_test_data(sentences, max_n):
normalized_sentences = [text_normalize(line).strip() + "E" for line in sentences] # text normalization, E: EOS
    texts = np.zeros((len(normalized_sentences), max_n + 1), np.int64)
for i, sent in enumerate(normalized_sentences):
texts[i, :len(sent)] = [char2idx[char] for char in sent]
return texts


class ESDSpeech(Dataset):
def __init__(self, keys, dir_name='EmotionalSpeechDataset'):
self.keys = keys
self.path = os.path.join(os.path.dirname(os.path.realpath(__file__)), dir_name)
self.fnames, self.text_lengths, self.texts = read_metadata(os.path.join(self.path, 'ESD.csv'))

def slice(self, start, end):
self.fnames = self.fnames[start:end]
self.text_lengths = self.text_lengths[start:end]
self.texts = self.texts[start:end]

def __len__(self):
return len(self.fnames)

def __getitem__(self, index):
data = {}
if 'texts' in self.keys:
data['texts'] = self.texts[index]
        if 'mels' in self.keys:
            # mel spectrogram, shape (T, n_mels), e.g. (39, 80)
            data['mels'] = np.load(os.path.join(self.path, 'mels', "%s.npy" % self.fnames[index]))
        if 'mags' in self.keys:
            # linear magnitude spectrogram, shape (T', 1 + n_fft // 2)
            data['mags'] = np.load(os.path.join(self.path, 'mags', "%s.npy" % self.fnames[index]))
        if 'mel_gates' in self.keys:
            data['mel_gates'] = np.ones(data['mels'].shape[0], dtype=np.int64)  # all ones: lengths already fixed in preprocessing
        if 'mag_gates' in self.keys:
            data['mag_gates'] = np.ones(data['mags'].shape[0], dtype=np.int64)  # all ones: lengths already fixed in preprocessing
return data
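A minimal smoke test for the new loader (hypothetical, not part of this PR; it assumes datasets/EmotionalSpeechDataset already contains ESD.csv plus the mels/ and mags/ .npy files produced by audio.preprocess):

```python
from datasets.esd_speech import ESDSpeech

# Request only the keys needed for Text2Mel training.
ds = ESDSpeech(['texts', 'mels', 'mel_gates'])
print(len(ds))                    # number of rows in ESD.csv
sample = ds[0]
print(sample['texts'].dtype)      # int64 character indices, EOS-terminated
print(sample['mels'].shape)       # (T, 80) mel spectrogram
print(sample['mel_gates'].shape)  # (T,) all-ones gate vector
```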
39 changes: 28 additions & 11 deletions dl_and_preprop_dataset.py
@@ -5,32 +5,34 @@
"""
__author__ = 'Erdene-Ochir Tuguldur'

-import os
-import sys
+import argparse
import csv
+import sys
import time
-import argparse
+from zipfile import ZipFile
+
import fnmatch
import librosa
+import os
import pandas as pd

-from hparams import HParams as hp
-from zipfile import ZipFile
from audio import preprocess
-from utils import download_file
-from datasets.mb_speech import MBSpeech
+from datasets.esd_speech import ESDSpeech
from datasets.lj_speech import LJSpeech
+from datasets.mb_speech import MBSpeech
+from hparams import HParams as hp
+from utils import download_file

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'esdspeech', 'mbspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
dataset_file_name = 'LJSpeech-1.1.tar.bz2'
datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
dataset_path = os.path.join(datasets_path, 'LJSpeech-1.1')

-    if os.path.isdir(dataset_path) and False:
+    if os.path.isdir(dataset_path):
print("LJSpeech dataset folder already exists")
sys.exit(0)
else:
@@ -48,12 +50,27 @@
print("pre processing...")
lj_speech = LJSpeech([])
preprocess(dataset_path, lj_speech)
+elif args.dataset == 'esdspeech':
+    datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
+    dataset_path = os.path.join(datasets_path, 'EmotionalSpeechDataset')
+
+    if os.path.isdir(dataset_path):
+        print("EmotionalSpeechDataset dataset folder already exists")
+    else:
+        print("please download the ESD dataset into %s first" % dataset_path)
+        sys.exit(1)
+
+    print("pre processing...")
+    esd_speech = ESDSpeech([])
+    preprocess(dataset_path, esd_speech)
elif args.dataset == 'mbspeech':
dataset_name = 'MBSpeech-1.0'
datasets_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets')
dataset_path = os.path.join(datasets_path, dataset_name)

-    if os.path.isdir(dataset_path) and False:
+    if os.path.isdir(dataset_path):
print("MBSpeech dataset folder already exists")
sys.exit(0)
else:
@@ -159,4 +176,4 @@ def _convert_mp3_to_wav(book_name, book_nr):
# pre process
print("pre processing...")
mb_speech = MBSpeech([])
-    preprocess(dataset_path, mb_speech)
\ No newline at end of file
+    preprocess(dataset_path, mb_speech)
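The new esdspeech branch expects an ESD.csv metadata file (columns file, text, subset) inside datasets/EmotionalSpeechDataset, which read_metadata then consumes; the PR does not generate that file. A hypothetical helper along these lines could build it from the per-speaker transcript files shipped with ESD (the tab-separated transcript layout is an assumption about the raw download, not something this diff guarantees):

```python
# Hypothetical ESD.csv builder; assumes each speaker folder (0011, 0012, ...)
# ships a <speaker>.txt transcript with "file<TAB>text<TAB>emotion" rows.
import csv
import glob
import os

def build_esd_csv(dataset_path, subset='train'):
    rows = []
    for txt in sorted(glob.glob(os.path.join(dataset_path, '*', '*.txt'))):
        with open(txt, encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    rows.append({'file': parts[0], 'text': parts[1], 'subset': subset})
    with open(os.path.join(dataset_path, 'ESD.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['file', 'text', 'subset'])
        writer.writeheader()
        writer.writerows(rows)
```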
28 changes: 27 additions & 1 deletion synthesize.py
@@ -16,12 +16,38 @@
from utils import get_last_checkpoint_file_name, load_checkpoint, save_to_png

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import vocab, get_test_data

SENTENCES = [
"The birch canoe slid on the smooth planks.",
"Glue the sheet to the dark blue background.",
"It's easy to tell the depth of a well.",
"These days a chicken leg is a rare dish.",
"Rice is often served in round bowls.",
"The juice of lemons makes fine punch.",
"The box was thrown beside the parked truck.",
"The hogs were fed chopped corn and garbage.",
"Four hours of steady work faced us.",
"Large size in stockings is hard to sell.",
"The boy was there when the sun rose.",
"A rod is used to catch pink salmon.",
"The source of the huge river is the clear spring.",
"Kick the ball straight and follow through.",
"Help the woman get back to her feet.",
"A pot of tea helps to pass the evening.",
"Smoky fires lack flame and heat.",
"The soft cushion broke the man's fall.",
"The salt breeze came across from the sea.",
"The girl at the booth sold fifty bonds."
]

+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import vocab, get_test_data
+
+    SENTENCES = [
+        "The birch canoe slid on the smooth planks.",
+        "Glue the sheet to the dark blue background.",
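For reference, get_test_data turns these sentences into a fixed-width matrix of character indices that synthesize.py feeds to Text2Mel. A sketch of the call (hp.max_N is the maximum-text-length hyperparameter used elsewhere in this repo; treat the exact name as an assumption):

```python
import torch
from hparams import HParams as hp
from datasets.esd_speech import get_test_data

# Zero-padded (num_sentences, hp.max_N + 1) int64 matrix; index 0 is the padding symbol 'P'.
L = torch.from_numpy(get_test_data(SENTENCES, hp.max_N))
print(L.shape)
```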
13 changes: 9 additions & 4 deletions train-ssrn.py
@@ -18,23 +18,28 @@
from datasets.data_loader import SSRNDataLoader

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import LJSpeech as SpeechDataset
+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import ESDSpeech as SpeechDataset
else:
from datasets.mb_speech import MBSpeech as SpeechDataset

use_gpu = torch.cuda.is_available()
print('use_gpu', use_gpu)

+device = 'cpu'
if use_gpu:
+    device = 'cuda:0'
    torch.backends.cudnn.benchmark = True

train_data_loader = SSRNDataLoader(ssrn_dataset=SpeechDataset(['mags', 'mels']), batch_size=24, mode='train')
valid_data_loader = SSRNDataLoader(ssrn_dataset=SpeechDataset(['mags', 'mels']), batch_size=24, mode='valid')

-ssrn = SSRN().cuda()
+ssrn = SSRN().to(device)

optimizer = torch.optim.Adam(ssrn.parameters(), lr=hp.ssrn_lr)

@@ -81,8 +86,8 @@ def train(train_epoch, phase='train'):
S = S.permute(0, 2, 1) # TODO: because of pre processing

M.requires_grad = False
-        M = M.cuda()
-        S = S.cuda()
+        M = M.to(device)
+        S = S.to(device)

Z_logit, Z = ssrn(S)

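The recurring change in this PR swaps hard-coded .cuda() calls for .to(device), which keeps the training scripts runnable on CPU-only machines. The pattern in isolation (an illustrative sketch, not lifted verbatim from the diff):

```python
import torch

# Pick the device once, then move the model and every batch tensor to it.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

model = torch.nn.Linear(80, 80).to(device)  # stand-in for SSRN / Text2Mel
batch = torch.randn(4, 80).to(device)       # stand-in for a training batch
print(model(batch).device)
```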
21 changes: 14 additions & 7 deletions train-text2mel.py
@@ -20,25 +20,31 @@
from datasets.data_loader import Text2MelDataLoader

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech'], help='dataset name')
+parser.add_argument("--dataset", required=True, choices=['ljspeech', 'mbspeech', 'esdspeech'], help='dataset name')
args = parser.parse_args()

if args.dataset == 'ljspeech':
from datasets.lj_speech import vocab, LJSpeech as SpeechDataset
+elif args.dataset == 'esdspeech':
+    from datasets.esd_speech import vocab, ESDSpeech as SpeechDataset
else:
from datasets.mb_speech import vocab, MBSpeech as SpeechDataset

use_gpu = torch.cuda.is_available()
# use_gpu = False
print('use_gpu', use_gpu)

+device = 'cpu'
if use_gpu:
+    device = 'cuda:1'
    torch.backends.cudnn.benchmark = True

train_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(['texts', 'mels', 'mel_gates']), batch_size=64,
mode='train')
valid_data_loader = Text2MelDataLoader(text2mel_dataset=SpeechDataset(['texts', 'mels', 'mel_gates']), batch_size=64,
mode='valid')

-text2mel = Text2Mel(vocab).cuda()
+text2mel = Text2Mel(vocab).to(device)

optimizer = torch.optim.Adam(text2mel.parameters(), lr=hp.text2mel_lr)

@@ -102,11 +108,12 @@ def W_nt(_, n, t, g=0.2):
W = np.fromfunction(W_nt, (B, N, T), dtype=np.float32)
W = torch.from_numpy(W)

-        L = L.cuda()
-        S = S.cuda()
-        S_shifted = S_shifted.cuda()
-        W = W.cuda()
-        gates = gates.cuda()
+        L = L.to(device)
+        S = S.to(device)
+        S_shifted = S_shifted.to(device)
+        W = W.to(device)
+        gates = gates.to(device)

Y_logit, Y, A = text2mel(L, S)

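One caveat for reviewers: this script pins the model to 'cuda:1' while train-ssrn.py uses 'cuda:0', so train-text2mel.py will crash on single-GPU machines. A defensive variant of the device selection (a suggestion, not part of this diff):

```python
import torch

# Use the second GPU when present, otherwise fall back gracefully.
if torch.cuda.is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'
```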