|
1 | 1 | {
|
2 |
| - "model_name": "queue", |
3 |
| - "model_description": "Queue memory and change lower r incrementatlly", |
| 2 | + "model_name": "TTS-Portuguese", |
| 3 | + "model_description": "Added support for the dataset TTS-Portuguese Corpus and adjusted hyperparameters for the convergence of the model", |
4 | 4 |
|
5 | 5 | "audio":{
|
6 | 6 | "audio_processor": "audio", // to dictate different audio processors, if available.
|
7 | 7 | // Audio processing parameters
|
8 | 8 | "num_mels": 80, // size of the mel spec frame.
|
9 | 9 | "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
|
10 |
| - "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. |
| 10 | + "sample_rate": 48000, // wav sample-rate. If different than the original data, it is resampled. |
11 | 11 | "frame_length_ms": 50, // stft window length in ms.
|
12 | 12 | "frame_shift_ms": 12.5, // stft window hop-length in ms.
|
13 | 13 | "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
|
26 | 26 | },
|
27 | 27 |
|
28 | 28 | "embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
|
29 |
| - "text_cleaner": "phoneme_cleaners", |
| 29 | + "text_cleaner": "phoneme_basic_cleaners", |
30 | 30 | "epochs": 1000, // total number of epochs to train.
|
31 | 31 | "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
32 | 32 | "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
|
|
45 | 45 | "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
46 | 46 |
|
47 | 47 | "run_eval": true,
|
48 |
| - "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument |
| 48 | + "data_path": "../TTS-Portuguese-Corpus/", // DATASET-RELATED: can be overwritten from command argument |
49 | 49 | "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
|
50 | 50 | "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
|
51 |
| - "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py |
| 51 | + "dataset": "ttsportuguese", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py |
52 | 52 | "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
|
53 |
| - "max_seq_len": 300, // DATASET-RELATED: maximum text length |
54 |
| - "output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/", // DATASET-RELATED: output path for all training outputs. |
| 53 | + "max_seq_len": 450, // DATASET-RELATED: maximum text length |
| 54 | + "output_path": "../TTS-Mozilla/Model/tts-portuguese_models/", // DATASET-RELATED: output path for all training outputs. |
55 | 55 | "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
56 | 56 | "num_val_loader_workers": 4, // number of evaluation data loader processes.
|
57 |
| - "phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. |
| 57 | + "phoneme_cache_path": "tts-portuguese_pt-br_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. |
58 | 58 | "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
|
59 |
| - "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages |
| 59 | + "phoneme_language": "pt-br" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages |
60 | 60 | }
|
0 commit comments