#! python

# encoding: utf-8
# Wellcome Trust Sanger Institute and Imperial College London
# Copyright (C) 2020 Imperial College London and Imperial College London
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#

# Generic imports
import sys
import os
import argparse
import subprocess
from multiprocessing import Pool
from functools import partial
from shutil import which

# command line parsing
def get_options():

    parser = argparse.ArgumentParser(description='Generate a ska alignment from a list '
                                                    'of assemblies',
                                     prog='generate_ska_alignment')

    # input options
    parser.add_argument('--reference',
                        help = 'Name of reference sequence to use for alignment',
                        required = True)
    parser.add_argument('--fasta',
                        help = 'Two column list of names and FASTA files to include in alignment',
                        default = None,
                        required = False)
    parser.add_argument('--fastq',
                        help = 'Two/three column list of names and of FASTQ files to include in alignment',
                        default = None,
                        required = False)
    parser.add_argument('--out',
                        help = 'Name of output alignment',
                        required = True)
    parser.add_argument('--k',
                        help = 'Split kmer size',
                        type = int,
                        default = 15)
    parser.add_argument('--threads',
                        help = 'Number of threads to use',
                        type = int,
                        default = 1)
    parser.add_argument('--no-cleanup',
                        help = 'Do not remove intermediate files',
                        action = 'store_true',
                        default = False)

    return parser.parse_args()

def map_fasta_sequence(seq, k = None, names = None):
    subprocess.check_output('ska fasta -o ' + names[seq]  + ' -k ' + str(k) + ' ' + seq,
                            shell = True)

def map_fastq_sequence(read_pair, k = None, names = None):
    subprocess.check_output('ska fastq -o ' + names[read_pair[0]]  + ' -k ' + str(k) + ' ' + \
                            read_pair[0] + ' ' + read_pair[1],
                            shell = True)

def ska_map_sequences(seq, k = None, ref = None):
    subprocess.check_output('ska map -o ' + seq + '.map -k ' + str(k) + ' -r ' + ref + \
                            ' ' + seq + '.skf',
                            shell = True)

# main code
if __name__ == "__main__":

    # Get command line options
    args = get_options()

    # Check if ska is installed
    if which('ska') is None:
        sys.stderr.write('SKA cannot be found on PATH; install with "conda install ska"')
        sys.exit(1)

    # Dictionary for sequence names
    seq_names = {}
    all_names = []

    # Make split kmers from assemblies
    fasta_names = []
    if args.fasta is not None:
        # Read in FASTA assemblies
        with open(args.fasta,'r') as fasta_list:
            for line in fasta_list.readlines():
                info = line.strip().split()
                if os.path.isfile(info[1]):
                    fasta_names.append(info[1])
                    seq_names[info[1]] = info[0]
                    all_names.append(info[0])
                else:
                    sys.stderr.write('Unable to find file ' + info[1] + '\n')
        # Sketch into split kmers
        with Pool(processes = args.threads) as pool:
            pool.map(partial(map_fasta_sequence,
                                k = args.k,
                                names = seq_names),
                                fasta_names)
    
    # Make split kmers from FASTQs
    fastq_names = []
    if args.fastq is not None:
        # Read in FASTQ reads
        with open(args.fastq,'r') as fastq_list:
            for line in fastq_list.readlines():
                info = line.strip().split()
                if os.path.isfile(fns[1]) and os.path.isfile(fns[2]):
                    fastq_names.append((info[1],info[2]))
                    seq_names[info[1]] = info[0]
                    all_names.append(info[0])
                else:
                    sys.stderr.write('Unable to find files ' + fns[1] + ' and ' + fns[2] + '\n')
        # Sketch into split kmers
        with Pool(processes = args.threads) as pool:
            return_codes = pool.map(partial(map_fastq_sequence,
                                            k = args.k,
                                            names = seq_names),
                                            fastq_names)
    
    # Map sequences
    with Pool(processes = args.threads) as pool:
        return_codes = pool.map(partial(ska_map_sequences,
                                        k = args.k,
                                        ref = args.reference),
                                        all_names)

    # Generate alignment
    subprocess.check_output('cat ' + ' '.join([seq + '.map.aln' for seq in all_names]) + ' > ' + args.out,
                            shell = True)

    # Clean up
    if not args.no_cleanup:
        subprocess.check_output('rm ' + ' '.join([seq + '.map.aln' for seq in all_names]),
                                shell = True)
        subprocess.check_output('rm ' + ' '.join([seq + '.skf' for seq in all_names]),
                                shell = True)