#!/usr/bin/env python2

import argparse
import sys
import os
import shutil
import glob
import re
import logging

args = None


def parse_args():
    """Parse the command line and return the populated namespace.

    Side effect: configures the root logger (INFO level, "[LEVEL] message"
    format) before parsing.

    Returns:
        argparse.Namespace with attributes ``indir``, ``outdir``,
        ``pattern``, ``keep``, ``mv`` and ``force``.  ``indir`` and
        ``outdir`` are normalised with ``os.path.normpath``; when
        ``--outdir`` is not given it defaults to ``<indir>.cluster``.
    """
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(
        description='clustering files by regular expression [V3.0]',
        epilog="https://github.com/shenwei356/easy_qsub")

    parser.add_argument('indir', type=str, help='source directory')
    parser.add_argument('-o',
                        '--outdir',
                        type=str,
                        help='out directory [<indir>.cluster]')
    # The regex example lives in a raw string so that "\d" and "\." are not
    # treated as (invalid) string escapes; every sentence ends with a space
    # so the concatenated help text does not run sentences together.
    parser.add_argument(
        '-p',
        '--pattern',
        type=str,
        help='pattern (regular expression) of files in indir. '
        'if not given, it will be the longest common substring of the files. '
        'GROUP (parenthese) should be in the regular expression. '
        r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
    parser.add_argument('-k',
                        '--keep',
                        action='store_true',
                        help='keep original dir structure')
    parser.add_argument('-m',
                        '--mv',
                        action='store_true',
                        help='moving files instead of creating symbolic links')
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        help='force file overwriting, i.e. deleting existed out directory')

    # Named "parsed" so the module-level "args" is not shadowed here.
    parsed = parser.parse_args()

    parsed.indir = os.path.normpath(parsed.indir)

    if not parsed.outdir:
        # indir is already normalised, so no trailing separator gets in the
        # way of appending the suffix.
        parsed.outdir = parsed.indir + '.cluster'
    parsed.outdir = os.path.normpath(parsed.outdir)

    return parsed


def longest_common_substring(s1, s2):
    """Return the longest run of consecutive items shared by s1 and s2.

    The result is a slice of ``s1``.  When several common runs have the
    same maximal length, the one that ends earliest in ``s1`` is returned.
    An empty slice is returned when nothing is shared.
    """
    width = len(s2) + 1
    best_len = 0
    best_end = 0  # index in s1 one past the end of the best run found so far

    # above[j] holds the length of the common run ending at the previous
    # item of s1 and at s2[j - 1]; only that one row is needed.
    above = [0] * width
    for end, left in enumerate(s1, 1):
        current = [0] * width
        for col, right in enumerate(s2, 1):
            if left != right:
                continue
            run = above[col - 1] + 1
            current[col] = run
            if run > best_len:
                best_len, best_end = run, end
        above = current

    return s1[best_end - best_len:best_end]


def _cluster_name(pattern, basename):
    """Return the cluster name that *pattern* extracts from *basename*.

    Returns None when the pattern does not match.  The first capturing
    group is used (an unmatched optional group yields ''); a pattern
    without any capturing group falls back to the whole match.
    """
    found = pattern.search(basename)
    if found is None:
        return None
    if pattern.groups < 1:
        return found.group(0)
    return found.group(1) or ''


def collect_targets(indir, pattern=None):
    """Walk *indir* and group its files into clusters, directory by directory.

    Only non-hidden regular files are considered (names starting with '.'
    and sub-directory entries are ignored).

    Args:
        indir: directory to scan recursively.
        pattern: compiled regular expression, or None.  With a pattern, the
            cluster of a file is the text captured from its basename and
            files that do not match are left out.  Without one, all files
            of a directory form a single cluster named after the longest
            common substring of their names (leading dots stripped); the
            directory is skipped when that name is empty.

    Returns:
        A list of ``(dirpath, groups)`` tuples in top-down walk order, where
        ``groups`` maps a cluster name to the list of file paths in that
        directory belonging to it.  Directories contributing nothing are
        omitted.
    """
    targets = []
    for dirpath, dirnames, filenames in os.walk(indir):
        dirnames.sort()  # deterministic traversal order
        names = sorted(name for name in filenames
                       if not name.startswith('.'))  # ignore .file
        if not names:
            # Empty directory (or hidden files only): nothing to cluster.
            continue

        groups = {}
        if pattern is None:
            lcs = names[0]
            for name in names[1:]:
                lcs = longest_common_substring(lcs, name)
            lcs = lcs.lstrip('.')
            if lcs == '':
                continue
            # Every name contains the common substring by construction.
            groups[lcs] = [os.path.join(dirpath, name) for name in names]
        else:
            for name in names:
                cluster = _cluster_name(pattern, name)
                if cluster is not None:
                    groups.setdefault(cluster, []).append(
                        os.path.join(dirpath, name))
            if not groups:
                continue

        targets.append((dirpath, groups))
    return targets


def place_files(targets, indir, outdir, keep=False, mv=False):
    """Create the cluster directories under *outdir* and fill them.

    Args:
        targets: result of ``collect_targets``.
        indir: the directory that was scanned; used to rebuild the relative
            sub-directory of each source directory when *keep* is set.
        outdir: root directory for the clusters.
        keep: reproduce the source directory structure below *outdir*.
        mv: move the files instead of creating relative symbolic links.

    A cluster directory that already exists before this call is left
    untouched (its files are skipped).  A cluster directory created during
    this call is shared by all source directories that map to it, so equal
    cluster names from different directories are merged rather than
    overwritten.  A file whose name is already taken inside the cluster
    directory is skipped with a warning.
    """
    created = set()  # cluster directories made during this call
    for dirpath, groups in targets:
        subdir = ''
        if keep:
            subdir = os.path.relpath(dirpath, indir)
            if subdir == os.curdir:  # files located directly in indir
                subdir = ''

        for cluster in sorted(groups):
            dest = os.path.join(outdir, subdir, cluster)
            if dest not in created:
                if os.path.exists(dest):
                    logging.info("ignore existed directory: {}".format(dest))
                    continue
                logging.info("create new directory: {}".format(dest))
                os.makedirs(dest)
                created.add(dest)

            for path in groups[cluster]:
                link_name = os.path.join(dest, os.path.basename(path))
                if os.path.lexists(link_name):
                    # Same basename already placed from another directory.
                    logging.warning(
                        "skip existed file: {}".format(link_name))
                    continue
                if mv:  # moving file
                    shutil.move(path, dest)
                else:  # creating symbolic links
                    source = os.path.relpath(os.path.abspath(path), dest)
                    os.symlink(source, link_name)


if __name__ == '__main__':
    args = parse_args()

    pattern = None
    if args.pattern:
        try:
            pattern = re.compile(args.pattern)
        except re.error:
            logging.error("illegal regular expression: {}".format(
                args.pattern))
            sys.exit(1)

        # Ask the compiled pattern for its capturing groups instead of
        # looking for parenthesis characters, which also accepts escaped
        # parentheses and non-capturing groups.
        if pattern.groups < 1:
            logging.error(
                'GROUP (parenthese) should be in the regular expression. '
                r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
            sys.exit(1)

    targets = collect_targets(args.indir, pattern)

    if not targets:
        if pattern is None:
            logging.error(
                'no common substring found in file names under: {}'.format(
                    args.indir))
        else:
            logging.error('no files match pattern: {}'.format(args.pattern))
        sys.exit(1)

    if os.path.exists(args.outdir):
        if args.force:
            # The whole output tree is removed once, up front; cluster
            # directories are never deleted again while being filled.
            shutil.rmtree(args.outdir)
        else:
            logging.info("update existed directory: {}".format(args.outdir))

    place_files(targets, args.indir, args.outdir, keep=args.keep, mv=args.mv)