|
1 | 1 | {
|
2 |
| - "model_name": "queue", |
3 |
| - "model_description": "Queue memory and change lower r incrementatlly", |
| 2 | + "model_name": "TTS-Portuguese", |
| 3 | + "model_description": "Added support for the dataset TTS-Portuguese Corpus and adjusted hyperparameters for the convergence of the model", |
4 | 4 |
|
5 | 5 | "audio":{
|
6 | 6 | "audio_processor": "audio", // to dictate different audio processors, if available.
|
7 | 7 | // Audio processing parameters
|
8 | 8 | "num_mels": 80, // size of the mel spec frame.
|
9 | 9 | "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
10 |
| - "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. |
| 10 | + "sample_rate": 48000, // wav sample-rate. If different than the original data, it is resampled. |
11 | 11 | "frame_length_ms": 50, // stft window length in ms.
|
12 | 12 | "frame_shift_ms": 12.5, // stft window hop-length in ms.
|
13 | 13 | "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
|
26 | 26 | },
|
27 | 27 |
|
28 | 28 | "embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
|
29 |
| - "text_cleaner": "phoneme_cleaners", |
| 29 | + "text_cleaner": "phoneme_basic_cleaners", |
30 | 30 | "epochs": 1000, // total number of epochs to train.
|
31 | 31 | "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
32 | 32 | "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
|
45 | 45 | "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
46 | 46 |
|
47 | 47 | "run_eval": true,
|
48 |
| - "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument |
| 48 | + "data_path": "../TTS-Portuguese-Corpus/", // DATASET-RELATED: can be overwritten from command argument |
49 | 49 | "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
|
50 | 50 | "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
|
51 |
| - "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py |
| 51 | + "dataset": "ttsportuguese", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py |
52 | 52 | "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
53 |
| - "max_seq_len": 300, // DATASET-RELATED: maximum text length |
54 |
| - "output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/", // DATASET-RELATED: output path for all training outputs. |
| 53 | + "max_seq_len": 450, // DATASET-RELATED: maximum text length |
| 54 | + "output_path": "../TTS-Mozilla/Model/tts-portuguese_models/", // DATASET-RELATED: output path for all training outputs. |
55 | 55 | "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
56 | 56 | "num_val_loader_workers": 4, // number of evaluation data loader processes.
|
57 |
| - "phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. |
| 57 | + "phoneme_cache_path": "tts-portuguese_pt-br_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. |
58 | 58 | "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
59 |
| - "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages |
| 59 | + "phoneme_language": "pt-br" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages |
60 | 60 | }
|
0 commit comments