###
# Casanovo configuration.
# Blank entries are interpreted as `None`.
###

###
# The following parameters can be modified when running inference or when
# fine-tuning an existing Casanovo model. They also affect database search
# parameters when running Casanovo in database search mode.
###

# Max absolute difference allowed with respect to observed precursor m/z.
# de novo: Predictions outside the tolerance range are assigned a negative
# peptide score.
# database search: Select candidate peptides within the specified precursor m/z
# tolerance.
precursor_mass_tol: 50  # ppm
# Isotopes to consider when comparing predicted and observed precursor m/z's.
isotope_error_range: [0, 1]
# The minimum length of considered peptides.
min_peptide_len: 6
# The maximum length of considered peptides.
max_peptide_len: 100
# Number of spectra in one inference batch.
predict_batch_size: 1024
# Number of PSMs for each spectrum.
top_match: 1
# The hardware accelerator to use. Must be one of:
# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
accelerator: "auto"
# The devices to use. Can be set to a positive integer, or to -1 to indicate
# that all available devices should be used. If left empty, the appropriate
# number will be automatically selected based on the chosen accelerator.
devices:


###
# The following parameters are unique to Casanovo's de novo sequencing mode.
###

# Number of beams used in beam search.
n_beams: 1


###
# The following parameters are unique to Casanovo's database search mode.
###

# Enzyme for in silico digestion, used to generate candidate peptides.
# See pyteomics.parser.expasy_rules for valid enzymes.
# Can also take a regex to specify custom digestion rules.
enzyme: "trypsin"
# Digestion type for candidate peptide generation.
# - full: Standard digestion.
# - semi: Include products of semi-specific cleavage.
# - non-specific: Include products of non-specific cleavage.
digestion: "full"
# Number of allowed missed cleavages when digesting proteins.
missed_cleavages: 0
# Maximum number of variable amino acid modifications per peptide.
# `None` generates all possible isoforms as candidates.
max_mods: 1
# Select which modifications from the vocabulary can be used in candidate
# creation.
# Format: Comma-separated list of "aa:mod_residue", where `aa` is a standard
# amino acid (or "nterm" for an N-terminal mod) and `mod_residue` is a key from
# the "residues" dictionary.
# Example: "M:M[Oxidation],nterm:[Carbamyl]-"
allowed_fixed_mods: "C:C[Carbamidomethyl]"
allowed_var_mods: "M:M[Oxidation],N:N[Deamidated],Q:Q[Deamidated],nterm:[Acetyl]-,nterm:[Carbamyl]-,nterm:[Ammonia-loss]-,nterm:[+25.980265]-"


###
# The following parameters should only be modified if you are training a new
# Casanovo model from scratch.
###

# Random seed to ensure reproducible results.
random_seed: 454

# OUTPUT OPTIONS
# Logging frequency in training steps.
n_log: 1
# Whether to create Tensorboard directory.
tb_summarywriter: false
# Whether to create csv_logs directory.
log_metrics: false
# How often to log optimizer parameters in steps.
log_every_n_steps: 50
# Path to save Lance instances.
lance_dir:
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000

# SPECTRUM PROCESSING OPTIONS
# Minimum number of peaks for a spectrum to be considered valid.
min_peaks: 20
# Maximum number of the most intense peaks to retain; any remaining peaks are
# discarded.
max_peaks: 150
# Min peak m/z allowed, peaks with smaller m/z are discarded.
min_mz: 50.0
# Max peak m/z allowed, peaks with larger m/z are discarded.
max_mz: 2500.0
# Min peak intensity allowed, less intense peaks are discarded.
min_intensity: 0.01
# Max absolute m/z difference allowed when removing the precursor peak.
remove_precursor_tol: 2.0  # Da
# Max precursor charge allowed, spectra with larger charge are skipped.
max_charge: 4

# MODEL ARCHITECTURE OPTIONS
# Dimensionality of latent representations, i.e. peak embeddings.
dim_model: 512
# Number of attention heads.
n_head: 8
# Dimensionality of fully connected layers.
dim_feedforward: 1024
# Number of transformer layers in spectrum encoder and peptide decoder.
n_layers: 9
# Dropout rate for model weights.
dropout: 0.0
# Number of dimensions to use for encoding peak intensity.
dim_intensity:
# The number of iterations for the linear warm-up of the learning rate.
warmup_iters: 100_000
# The number of iterations for the cosine half period of the learning rate.
cosine_schedule_period_iters: 600_000
# Learning rate for weight updates during training.
# Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
# resolve a bare `5e-4` as a string rather than a float.
learning_rate: 5.0e-4
# Regularization term for weight updates.
# The decimal point is required here for the same reason as `learning_rate`.
weight_decay: 1.0e-5
# Amount of label smoothing when computing the training loss.
train_label_smoothing: 0.01

# TRAINING/INFERENCE OPTIONS
# Number of spectra in one training batch.
train_batch_size: 32
# Max number of training epochs.
max_epochs: 30
# Shuffle dataset during training.
shuffle: true
# Number of samples to buffer while randomly shuffling the training data.
shuffle_buffer_size: 10_000
# Number of validation steps to run before training begins.
num_sanity_val_steps: 0
# Calculate peptide and amino acid precision during training.
# This is expensive, so we recommend against it.
calculate_precision: false

# Accumulates gradients over `k` batches before stepping the optimizer.
accumulate_grad_batches: 1
# The value at which to clip gradients. `None` disables gradient clipping.
gradient_clip_val:
# The gradient clipping algorithm to use.
# Must be one of: "value", "norm", or `None`.
gradient_clip_algorithm:
# Floating point precision.
# Must be one of: "16-true", "16-mixed", "bf16-true", "bf16-mixed", "32-true",
# "64-true", "64", "32", "16", or "bf16".
precision: "32-true"

# Replace I by L in peptide sequences.
replace_isoleucine_with_leucine: true
# Reverse peptide sequences.
reverse_peptides: true
# MassIVE-KB style tokenizer, otherwise ProForma syntax.
massivekb_tokenizer: false

# AMINO ACID AND MODIFICATION VOCABULARY
# Maps each residue token to its mass. The keys are the `mod_residue` names
# that `allowed_fixed_mods` and `allowed_var_mods` refer to.
# NOTE(review): values look like monoisotopic masses in Da — confirm.
residues:
  # Amino acids. Cysteine is listed only in its carbamidomethylated form;
  # there is no plain "C" entry.
  "G": 57.021464
  "A": 71.037114
  "S": 87.032028
  "P": 97.052764
  "V": 99.068414
  "T": 101.047670
  "C[Carbamidomethyl]": 160.030649 # 103.009185 + 57.021464
  # L and I are assigned the same mass.
  "L": 113.084064
  "I": 113.084064
  "N": 114.042927
  "D": 115.026943
  "Q": 128.058578
  "K": 128.094963
  "E": 129.042593
  "M": 131.040485
  "H": 137.058912
  "F": 147.068414
  "R": 156.101111
  "Y": 163.063329
  "W": 186.079313
  # Amino acid modifications.
  "M[Oxidation]": 147.035400      # Met oxidation:   131.040485 + 15.994915
  "N[Deamidated]": 115.026943     # Asn deamidation: 114.042927 +  0.984016
  "Q[Deamidated]": 129.042594     # Gln deamidation: 128.058578 +  0.984016
  # N-terminal modifications. Keys end in "-"; the values are presumably mass
  # shifts rather than full residue masses (note the negative ammonia loss).
  "[Acetyl]-": 42.010565                  # Acetylation
  "[Carbamyl]-": 43.005814                # Carbamylation
  "[Ammonia-loss]-": -17.026549           # Ammonia loss
  # NOTE(review): the two entries above sum to 43.005814 - 17.026549 =
  # 25.979265, which differs from the value below by 0.001. Confirm which is
  # intended before changing it: the key embeds the value and is referenced by
  # `allowed_var_mods`.
  "[+25.980265]-": 25.980265              # Carbamylation and ammonia loss
  # Disabled alternative key for the combined modification above.
  #"[Carbamyl][Ammonia-loss]-": 25.980265  # Carbamylation and ammonia loss