# Corpus opts:
data:
    corpus1:
        path_src: "train.es"
        path_tgt: "train.hr"
        transforms: [sentencepiece, prefix, suffix, filtertoolong]
        weight: 10
        src_prefix: "spa_Latn"
        tgt_prefix: "hrv_Latn"
        src_suffix: ""
        tgt_suffix: ""

src_vocab: "shared_vocabulary.txt"
src_words_min_frequency: 1
src_vocab_size: 256206
tgt_vocab: "shared_vocabulary.txt"
tgt_words_min_frequency: 1
tgt_vocab_size: 256206
vocab_size_multiple: 1
decoder_start_token: '</s>'

#### Subword
src_subword_model: "flores200_sacrebleu_tokenizer_spm.model"
tgt_subword_model: "flores200_sacrebleu_tokenizer_spm.model"
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0

update_vocab: true
train_from: "nllb-200-600M-onmt.pt"
reset_optim: all
save_data: "./finetuned600M"
save_model: "./finetuned600M/nllb-200-600-FT-onmt"
log_file: "./finetuned600M/nllb-200-600-FT-onmt.log"
keep_checkpoint: 50
save_checkpoint_steps: 100
average_decay: 0.0005
seed: 1234
report_every: 10
train_steps: 2000
valid_steps: 100

# Batching
bucket_size: 262144
num_workers: 4
prefetch_factor: 400
world_size: 1
gpu_ranks: [0]
batch_type: "tokens"
batch_size: 384
valid_batch_size: 384
batch_size_multiple: 1
accum_count: [32, 32, 32]
accum_steps: [0, 15000, 30000]

# Optimization
model_dtype: "fp16"
optim: "sgd"
learning_rate: 30
warmup_steps: 100
decay_method: "noam"
adam_beta2: 0.98
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
override_opts: true
encoder_type: transformer
decoder_type: transformer
enc_layers: 12
dec_layers: 12
heads: 16
hidden_size: 1024
word_vec_size: 1024
transformer_ff: 4096
add_qkvbias: true
add_ffnbias: true
dropout_steps: [0, 15000, 30000]
dropout: [0.1, 0.1, 0.1]
attention_dropout: [0.1, 0.1, 0.1]
share_decoder_embeddings: true
share_embeddings: true
position_encoding: true
position_encoding_type: 'SinusoidalConcat'
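
# Usage sketch (an assumption, not stated in this config): with OpenNMT-py
# installed, fine-tuning is typically launched by pointing onmt_train at this
# file, e.g.
#
#   onmt_train -config nllb-finetune.yaml
#
# "nllb-finetune.yaml" is a hypothetical name for this file. The pretrained
# checkpoint referenced by train_from ("nllb-200-600M-onmt.pt"), the shared
# vocabulary, and the SentencePiece model named above must already be present
# at the given paths before training starts.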