# Tacotron2 configuration: shared audio/feature constants, the model
# (datasets, preprocessor, encoder, decoder, postnet, optimizer), the trainer
# and the experiment manager.
#
# Conventions used below:
#   ${key}   - refers to the value of another key in this file.
#   ???      - no value is given here; it has to be supplied by the user.
#   _target_ - dotted path of the class the section is meant to configure.
# NOTE(review): this is OmegaConf/Hydra-style syntax - confirm against the
# loader that consumes this file.

name: Tacotron2
sample_rate: 22050

# The BOS, EOS and PAD symbols will be added by the tacotron2.py script
# (see the commented-out bos_id / eos_id / pad_id entries in the datasets).
labels: [' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?',
         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
         'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
         '[', ']',
         'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

# Spectrogram constants shared by the preprocessor, decoder and postnet.
n_fft: 1024
n_mels: 80
fmax: 8000
n_stride: 256
pad_value: -11.52

# Dataset manifests; both must be provided by the user.
train_dataset: ???
validation_datasets: ???

model:
  labels: ${labels}

  train_ds:
    dataset:
      _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset"
      manifest_filepath: ${train_dataset}
      max_duration: null
      min_duration: 0.1
      trim: false
      int_values: false
      normalize: true
      sample_rate: ${sample_rate}
      # bos_id, eos_id and pad_id are added automatically in Tacotron2:
      # bos_id: 66
      # eos_id: 67
      # pad_id: 68
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4

  validation_ds:
    dataset:
      _target_: "nemo.collections.asr.data.audio_to_text.AudioToCharDataset"
      manifest_filepath: ${validation_datasets}
      max_duration: null
      min_duration: 0.1
      int_values: false
      normalize: true
      sample_rate: ${sample_rate}
      trim: false
      # bos_id, eos_id and pad_id are added automatically in Tacotron2:
      # bos_id: 66
      # eos_id: 67
      # pad_id: 68
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 48
      num_workers: 8

  preprocessor:
    _target_: nemo.collections.asr.parts.features.FilterbankFeatures
    dither: 0.0
    nfilt: ${n_mels}
    frame_splicing: 1
    highfreq: ${fmax}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: 0
    mag_power: 1.0
    n_fft: ${n_fft}
    n_window_size: 1024
    n_window_stride: ${n_stride}
    normalize: null
    pad_to: 16
    pad_value: ${pad_value}
    preemph: null
    sample_rate: ${sample_rate}
    window: hann

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    # Kept equal to the encoder's embedding size via interpolation.
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${n_mels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${n_mels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6

    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5

trainer:
  gpus: 1  # number of gpus
  max_epochs: ???
  num_nodes: 1
  accelerator: ddp
  accumulate_grad_batches: 1
  checkpoint_callback: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  gradient_clip_val: 1.0
  flush_logs_every_n_steps: 1000
  log_every_n_steps: 200
  check_val_every_n_epoch: 25

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true