# Author: Sjoerd de Vries, CNRS. Copyright 2023-2024.

import sys
import numpy as np

atom13_G = [4.66517696, -0.69231318, 0.03220869]
atom13_G[2] = 0  # impose base planarity
atom13_G = np.array(atom13_G)


def mutate_AtoG(input, output):
    """mutates A to G in output"""
    assert input.shape == (22, 3), input.shape
    assert output.shape == (23, 3), output.shape
    v1 = input[11] - input[8]
    v2 = input[13] - input[8]
    v3 = np.cross(v1, v2)
    v2a = np.cross(v3, v1)
    x = v1 / np.linalg.norm(v1)
    y = v2a / np.linalg.norm(v2a)
    z = v3 / np.linalg.norm(v3)
    mat = np.stack((x, y, z))

    output[:12] = input[:12]
    output[13:] = input[12:]
    atom13 = atom13_G.dot(mat) + input[8]
    output[12] = atom13


rna_sizes = {"A": 22, "C": 20, "G": 23, "U": 20}


def mutate(arr: np.ndarray, input_sequence: str, output_sequence: str) -> np.ndarray:
    """Mutate heavy-atom fragment or fragment library, from A/C to A/C/G/U.

    arr: Input fragment coordinates in .npy format.

    The dtype must be float, and the shape must be (X, 3) or (Y, X, 3).
    X is the number of heavy atoms in the fragment, which must match with the sequence:
    22 atoms for each A, 20 atoms for each C.
    This is what is generated by ATTRACT aareduce, using the OPLSX force field for RNA.

    input_sequence: Sequence of the input fragment(s). Must consist of A and/or C",
    output_sequence: Sequence of the output fragment(s). Must consist of A, C, G and/or U".

    Returns: Output fragment coordinates in .npy format.
    The shape will be the same as the input data, but with one extra atom for each G."""

    if len(input_sequence) != len(output_sequence):
        err("Input and output sequence are not of the same length")

    totsize = 0
    for char in input_sequence:
        if char not in ("A", "C"):
            err("Input sequence must contain only A and C")
        totsize += rna_sizes[char]

    output_totsize = 0
    for char in output_sequence:
        if char not in ("A", "C", "G", "U"):
            err("Output sequence must contain only A, C, G, U")
        output_totsize += rna_sizes[char]

    if arr.ndim not in (2, 3) or arr.shape[-1] != 3:
        err(f"Input data has the wrong shape: {arr.shape}")

    if arr.shape[-2] != totsize:
        err(
            f"Input data has the wrong number of atoms for the sequence: {arr.shape[-2]} instead of {totsize}"
        )
    if arr.ndim == 2:
        inp = arr[None, :, :]
    else:
        inp = arr
    outp = np.empty((inp.shape[0], output_totsize, 3), inp.dtype)

    offset = 0
    output_offset = 0

    for nuc1, nuc2 in zip(input_sequence, output_sequence):
        size = rna_sizes[nuc1]
        output_size = rna_sizes[nuc2]
        for frag in range(len(inp)):
            curr_inp = inp[frag, offset : offset + size]
            curr_outp = outp[frag, output_offset : output_offset + output_size]
            if nuc1 != nuc2:
                if (nuc1, nuc2) not in (("A", "G"), ("C", "U")):
                    err(f"Cannot mutate {nuc1} to {nuc2}")
                if (nuc1, nuc2) == ("A", "G"):
                    mutate_AtoG(curr_inp, curr_outp)
                elif (nuc1, nuc2) == ("C", "U"):
                    curr_outp[:] = curr_inp  # equivalent atoms in the same order
            else:
                curr_outp[:] = curr_inp  # no change
        offset += size
        output_offset += output_size

    if arr.ndim == 2:
        outp = outp[0]
    return outp


def mutate_to_AC(
    arr: np.ndarray,
    input_sequence: str,
) -> tuple[np.ndarray, str]:
    """Mutate heavy-atom fragment or fragment library, from A/C/G/U to A/C.

    arr: Input fragment coordinates in .npy format.

    The dtype must be float, and the shape must be (X, 3) or (Y, X, 3).
    X is the number of heavy atoms in the fragment, which must match with the sequence:
    22 atoms for each A, 20 atoms for each C, 23 atoms for each G, 20 atoms for each U.
    This is what is generated by ATTRACT aareduce, using the OPLSX force field for RNA.

    input_sequence: Sequence of the input fragment(s). Must consist of A, C, G and/or U".

    Returns:
        - Output fragment coordinates in .npy format.
            The shape will be the same as the input data, but with one less atom for each G.
    """

    totsize = 0
    output_totsize = 0
    output_sequence = ""
    for char in input_sequence:
        if char not in ("A", "C", "G", "U"):
            err("Input sequence must contain only A, C, G, U")
        rna_size = rna_sizes[char]
        totsize += rna_size

        output_totsize += rna_size
        if char == "G":
            output_totsize -= 1
            char = "A"
        elif char == "U":
            char = "C"
        output_sequence = output_sequence + char

    if arr.ndim not in (2, 3) or arr.shape[-1] != 3:
        err(f"Input data has the wrong shape: {arr.shape}")

    if arr.shape[-2] != totsize:
        err(
            f"Input data has the wrong number of atoms for the sequence: {arr.shape[-2]} instead of {totsize}"
        )
    if arr.ndim == 2:
        inp = arr[None, :, :]
    else:
        inp = arr
    outp = np.empty((inp.shape[0], output_totsize, 3), inp.dtype)

    offset = 0
    output_offset = 0

    for nuc1, nuc2 in zip(input_sequence, output_sequence):
        size = rna_sizes[nuc1]
        output_size = rna_sizes[nuc2]
        for frag in range(len(inp)):
            curr_inp = inp[frag, offset : offset + size]
            curr_outp = outp[frag, output_offset : output_offset + output_size]
            if nuc1 != nuc2:
                if (nuc1, nuc2) not in (("G", "A"), ("U", "C")):
                    err(f"Cannot mutate {nuc1} to {nuc2}")
                if (nuc1, nuc2) == ("G", "A"):
                    curr_outp[:12] = curr_inp[:12]
                    curr_outp[12:] = curr_inp[13:]
                elif (nuc1, nuc2) == ("U", "C"):
                    curr_outp[:] = curr_inp  # equivalent atoms in the same order
            else:
                curr_outp[:] = curr_inp  # no change
        offset += size
        output_offset += output_size

    if arr.ndim == 2:
        outp = outp[0]
    return outp, output_sequence


if __name__ == "__main__":

    def err(*args, **kwargs):
        print(*args, **kwargs, file=sys.stderr)
        exit(1)

    import argparse

    parser = argparse.ArgumentParser(
        description="Mutate heavy-atom fragment or fragment library, from A/C to A/C/G/U"
    )
    parser.add_argument(
        "input_sequence",
        help="Sequence of the input fragment(s). Must consist of A and/or C",
    )
    parser.add_argument(
        "output_sequence",
        help="Sequence of the output fragment(s). Must consist of A, C, G and/or U",
    )
    parser.add_argument(
        "input_data",
        type=argparse.FileType("rb"),
        help="""Input fragment coordinates in .npy format.
The dtype must be float, and the shape must be (X, 3) or (Y, X, 3).
X is the number of heavy atoms in the fragment, which must match with the sequence:
22 atoms for each A, 20 atoms for each C.""",
    )
    parser.add_argument(
        "output_data",
        type=argparse.FileType("wb"),
        help="""Output fragment coordinates in .npy format.
The shape will be the same as the input data, but with one extra atom for each G.""",
    )

    args = parser.parse_args()
    input_sequence = args.input_sequence
    output_sequence = args.output_sequence
    arr = np.load(args.input_data)
    outp = mutate(arr, input_sequence, output_sequence)

    np.save(args.output_data, outp)