Skip to content

Commit d3dddac

Browse files
committed
Added support for TTS-Portuguese Corpus
1 parent db7f3d3 commit d3dddac

File tree

5 files changed

+57
-33
lines changed

5 files changed

+57
-33
lines changed

config.json

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
{
2-
"model_name": "queue",
3-
"model_description": "Queue memory and change lower r incrementatlly",
2+
"model_name": "TTS-Portuguese",
3+
"model_description": "Added support for the dataset TTS-Portuguese Corpus and adjusted hyperparameters for the convergence of the model",
44

55
"audio":{
66
"audio_processor": "audio", // used to select different audio processors, if available.
77
// Audio processing parameters
88
"num_mels": 80, // size of the mel spec frame.
99
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
10-
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
10+
"sample_rate": 48000, // wav sample-rate. If different than the original data, it is resampled.
1111
"frame_length_ms": 50, // stft window length in ms.
1212
"frame_shift_ms": 12.5, // stft window hop-length in ms.
1313
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
@@ -26,7 +26,7 @@
2626
},
2727

2828
"embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
29-
"text_cleaner": "phoneme_cleaners",
29+
"text_cleaner": "phoneme_basic_cleaners",
3030
"epochs": 1000, // total number of epochs to train.
3131
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
3232
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
@@ -45,16 +45,16 @@
4545
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
4646

4747
"run_eval": true,
48-
"data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument
48+
"data_path": "../TTS-Portuguese-Corpus/", // DATASET-RELATED: can be overwritten from command argument
4949
"meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
5050
"meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
51-
"dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
51+
"dataset": "ttsportuguese", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
5252
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
53-
"max_seq_len": 300, // DATASET-RELATED: maximum text length
54-
"output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/", // DATASET-RELATED: output path for all training outputs.
53+
"max_seq_len": 450, // DATASET-RELATED: maximum text length
54+
"output_path": "../TTS-Mozilla/Model/tts-portuguese_models/", // DATASET-RELATED: output path for all training outputs.
5555
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
5656
"num_val_loader_workers": 4, // number of evaluation data loader processes.
57-
"phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
57+
"phoneme_cache_path": "tts-portuguese_pt-br_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
5858
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
59-
"phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
59+
"phoneme_language": "pt-br" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
6060
}

datasets/preprocess.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def mailabs(root_path, meta_files):
6565

6666

6767
def ljspeech(root_path, meta_file):
68-
"""Normalizes the Nancy meta data file to TTS format"""
68+
"""Normalizes the LJSpeech meta data file to TTS format"""
6969
txt_file = os.path.join(root_path, meta_file)
7070
items = []
7171
with open(txt_file, 'r') as ttf:
@@ -77,6 +77,22 @@ def ljspeech(root_path, meta_file):
7777
random.shuffle(items)
7878
return items
7979

80+
def ttsportuguese(root_path, meta_file):
    """Normalize the TTS-Portuguese Corpus metadata file to TTS format.

    Each metadata line has the form ``<relative wav path>==<transcription>``.
    Files whose numeric index falls in the held-out test range 5655-5674
    (phonetically balanced sentences) are skipped.

    Args:
        root_path: dataset root directory.
        meta_file: metadata file name, relative to ``root_path``.

    Returns:
        A shuffled list of ``[text, wav_file_path]`` pairs. ``text`` keeps
        whatever trailing newline the metadata line carried (same behavior
        as the other preprocessors in this module).
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    with open(txt_file, 'r') as ttf:
        for line in ttf:
            cols = line.split('==')
            file_name = os.path.basename(cols[0])
            # Parse the numeric index once instead of twice per line.
            file_idx = int(file_name.split('-')[1].replace('.wav', ''))
            # Indices 5655-5674 are reserved for evaluation.
            if 5655 <= file_idx <= 5674:
                print('ignored file:', file_name, 'because this file is used for test (phonetically balanced sentence)')
                continue
            wav_file = os.path.join(root_path, cols[0])
            text = cols[1]
            items.append([text, wav_file])
    random.shuffle(items)
    return items
8096

8197
def nancy(root_path, meta_file):
8298
"""Normalizes the Nancy meta data file to TTS format"""

utils/text/__init__.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,9 @@ def text2phone(text, language):
3030
ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
3131
# Replace \n with matching punctuations.
3232
if len(punctuations) > 0:
33-
for punct in punctuations[:-1]:
34-
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
35-
try:
36-
ph = ph[:-1] + punctuations[-1]
37-
except:
38-
print(text)
33+
#print(text,'--->',ph)
34+
for punct in punctuations:
35+
ph = ph.replace('| |\n', '|'+punct+'| |', 1)
3936
return ph
4037

4138

@@ -46,14 +43,14 @@ def phoneme_to_sequence(text, cleaner_names, language):
4643
sequence = []
4744
clean_text = _clean_text(text, cleaner_names)
4845
phonemes = text2phone(clean_text, language)
49-
# print(phonemes.replace('|', ''))
46+
# print(phonemes.replace('|', ''))
5047
if phonemes is None:
5148
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
5249
for phoneme in phonemes.split('|'):
53-
# print(word, ' -- ', phonemes_text)
5450
sequence += _phoneme_to_sequence(phoneme)
55-
# Aeepnd EOS char
56-
sequence.append(_phonemes_to_id['~'])
51+
#print(clean_text, ' -- ', phonemes.replace('|', ''))
52+
# Append EOS char
53+
sequence.append(_phonemes_to_id['&'])
5754
return sequence
5855

5956

@@ -94,7 +91,7 @@ def text_to_sequence(text, cleaner_names):
9491
text = m.group(3)
9592

9693
# Append EOS token
97-
sequence.append(_symbol_to_id['~'])
94+
sequence.append(_symbol_to_id['&'])
9895
return sequence
9996

10097

@@ -133,8 +130,8 @@ def _arpabet_to_sequence(text):
133130

134131

135132
def _should_keep_symbol(s):
    """Return True if *s* maps to a known symbol id and is not the pad ('_') or EOS ('&') marker."""
    # Use `!=` rather than `is not`: identity comparison against str literals
    # relies on interning, is implementation-dependent, and emits a
    # SyntaxWarning on Python 3.8+.
    return s in _symbol_to_id and s != '_' and s != '&'
137134

138135

139136
def _should_keep_phoneme(p):
    """Return True if *p* maps to a known phoneme id and is not the pad ('_') or EOS ('&') marker."""
    # Use `!=` rather than `is not`: identity comparison against str literals
    # relies on interning, is implementation-dependent, and emits a
    # SyntaxWarning on Python 3.8+.
    return p in _phonemes_to_id and p != '_' and p != '&'

utils/text/cleaners.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,8 @@ def phoneme_cleaners(text):
9595
text = expand_abbreviations(text)
9696
text = collapse_whitespace(text)
9797
return text
98+
99+
def phoneme_basic_cleaners(text):
    '''Basic pipeline for phoneme mode: only collapses whitespace, no transliteration.'''
    return collapse_whitespace(text)

utils/text/symbols.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,21 @@
88
from utils.text import cmudict
99

1010
_pad = '_'
11-
_eos = '~'
12-
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
13-
_punctuations = '!\'(),-.:;? '
14-
_phoneme_punctuations = '.!;:,?'
15-
16-
# TODO: include more phoneme characters for other languages.
17-
_phonemes = ['l','ɹ','ɜ','ɚ','k','u','ʔ','ð','ɐ','ɾ','ɑ','ɔ','b','ɛ','t','v','n','m','ʊ','ŋ','s',
18-
'ʌ','o','ʃ','i','p','æ','e','a','ʒ',' ','h','ɪ','ɡ','f','r','w','ɫ','ɬ','d','x','ː',
19-
'ᵻ','ə','j','θ','z','ɒ']
11+
_eos = '&'
12+
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÇÃÀÁÂÊÉÍÓÔÕÚÛabcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû!(),-.:;? '
13+
_punctuations = '!(),-.:;? '
14+
_phoneme_punctuations = '.!;:,? '
15+
16+
# Phonemes definition
17+
_vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ'
18+
_non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ'
19+
_pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ'
20+
_suprasegmentals = 'ˈˌːˑ'
21+
_other_symbols = "ʍwɥʜʢʡɕʑɺɧ'̃' "
22+
_diacrilics = 'ɚ˞ɫ'
23+
_phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics))
24+
25+
2026

2127
_phonemes = sorted(list(set(_phonemes)))
2228

0 commit comments

Comments
 (0)