#!/usr/bin/env python2
"""Cluster the files of a directory tree into one sub directory per cluster.

Every directory below ``indir`` is inspected on its own.  The cluster name of
a file is either the first captured group of the user supplied regular
expression or, when no expression is given, the longest common substring of
all visible file names in that directory.  For every cluster a directory is
created below ``outdir`` and the member files are symlinked (default) or
moved into it.

The code only relies on features available in both Python 2.7 and Python 3.
"""
import argparse
import sys
import os
import shutil
import glob
import re
import logging

args = None


def parse_args(argv=None):
    """Configure logging, parse the command line and normalise the paths.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``indir`` and ``outdir`` normalised.
        When ``--outdir`` is omitted it defaults to ``<indir>.cluster``.
    """
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(
        description='clustering files by regular expression [V3.0]',
        epilog="https://github.com/shenwei356/easy_qsub")

    parser.add_argument('indir', type=str, help='source directory')
    parser.add_argument('-o', '--outdir', type=str,
                        help='out directory [.cluster]')
    parser.add_argument(
        '-p', '--pattern', type=str,
        help='pattern (regular expression) of files in indir. ' +
        'if not given, it will be the longest common substring of the files.' +
        'GROUP (parenthese) should be in the regular expression. ' +
        r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
    parser.add_argument('-k', '--keep', action='store_true',
                        help='keep original dir structure')
    parser.add_argument('-m', '--mv', action='store_true',
                        help='moving files instead of creating symbolic links')
    parser.add_argument(
        "-f", "--force", action="store_true",
        help='force file overwriting, i.e. deleting existed out directory')

    parsed = parser.parse_args(argv)

    parsed.indir = os.path.normpath(parsed.indir)
    if not parsed.outdir:
        parsed.outdir = os.path.normpath(parsed.indir) + '.cluster'
    parsed.outdir = os.path.normpath(parsed.outdir)
    return parsed


def longest_common_substring(s1, s2):
    """Return the longest contiguous substring shared by ``s1`` and ``s2``.

    Classic dynamic programming: ``m[x][y]`` is the length of the common
    suffix of ``s1[:x]`` and ``s2[:y]``.  On ties the substring that ends
    first in ``s1`` wins.  Returns ``''`` when nothing is shared.
    """
    m = [[0] * (1 + len(s2)) for _ in range(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    return s1[x_longest - longest:x_longest]


def _cluster_name(pattern, basename):
    """Return the first captured group of ``pattern`` in ``basename``.

    ``None`` is returned when the name does not match or the group is empty,
    because an empty cluster name cannot be used as a directory name.
    """
    match = pattern.search(basename)
    if match is None:
        return None
    return match.group(1) or None


def collect_targets(indir, pattern=None):
    """Group the visible files of every directory below ``indir`` by cluster.

    Args:
        indir: root directory to walk.
        pattern: compiled regular expression with at least one capturing
            group, or ``None`` to use the longest common substring of the
            file names of each directory as its single cluster name.

    Returns:
        A list of ``(dirpath, groups)`` tuples, where ``groups`` maps a
        cluster name to the list of file paths belonging to it.  Directories
        without any clustered file are left out.
    """
    targets = []
    for dirpath, _, filenames in os.walk(indir):
        # Hidden files are ignored everywhere; sorting keeps runs reproducible.
        visible = sorted(name for name in filenames
                         if not name.startswith('.'))
        if not visible:  # empty directory: nothing to cluster
            continue

        groups = {}
        if pattern is None:
            lcs = visible[0]
            for name in visible[1:]:
                lcs = longest_common_substring(lcs, name)
            # A leading dot would turn the cluster directory into a hidden one.
            lcs = lcs.lstrip('.')
            if lcs == '':
                continue
            groups[lcs] = [os.path.join(dirpath, name) for name in visible]
        else:
            for name in visible:
                cluster = _cluster_name(pattern, name)
                if cluster is None:
                    continue
                groups.setdefault(cluster, []).append(
                    os.path.join(dirpath, name))

        if groups:
            targets.append((dirpath, groups))
    return targets


def cluster_files(targets, indir, outdir, keep=False, mv=False, force=False):
    """Create the cluster directories and link or move the files into them.

    Args:
        targets: result of :func:`collect_targets`.
        indir: root directory the targets were collected from.
        outdir: directory receiving the cluster directories.
        keep: reproduce the path of each source directory relative to
            ``indir`` below ``outdir``.
        mv: move the files instead of creating relative symbolic links.
        force: also fill cluster directories that already exist on disk.
    """
    # Cluster directories prepared by this call.  They may be filled again by
    # another source directory with the same cluster name and must never be
    # removed, otherwise files moved earlier in the run would be lost.
    prepared = set()

    for dirpath, groups in targets:
        newdir = ''
        if keep:
            relative = os.path.relpath(dirpath, indir)
            if relative != os.curdir:
                newdir = relative

        for cluster in sorted(groups):
            cluster_dir = os.path.join(outdir, newdir, cluster)
            if cluster_dir not in prepared:
                if not os.path.isdir(cluster_dir):
                    logging.info(
                        "create new directory: {}".format(cluster_dir))
                    os.makedirs(cluster_dir)
                elif not force:
                    logging.info(
                        "ignore existed directory: {}".format(cluster_dir))
                    continue
                prepared.add(cluster_dir)

            for path in groups[cluster]:
                dest = os.path.join(cluster_dir, os.path.basename(path))
                if os.path.lexists(dest):
                    logging.warning("ignore existed file: {}".format(dest))
                    continue
                if mv:  # moving file
                    shutil.move(path, dest)
                else:  # creating symbolic links
                    source = os.path.relpath(os.path.abspath(path),
                                             cluster_dir)
                    os.symlink(source, dest)


def main():
    """Entry point of the script: parse options, collect and cluster files."""
    global args
    args = parse_args()

    pattern = None
    if args.pattern:
        try:
            pattern = re.compile(args.pattern)
        except re.error:
            logging.error("illegal regular expression: {}".format(
                args.pattern))
            sys.exit(1)
        # Only a real capturing group can provide the cluster name.
        if pattern.groups < 1:
            logging.error(
                'GROUP (parenthese) should be in the regular expression. ' +
                r'Captured group will be the cluster name. e.g. "(.+?)_\d\.fq\.gz"')
            sys.exit(1)

    targets = collect_targets(args.indir, pattern)
    if not targets:
        logging.error('no files match pattern: {}'.format(args.pattern))
        sys.exit(1)

    if os.path.exists(args.outdir):
        if args.force:
            shutil.rmtree(args.outdir)
        else:
            logging.info("update existed directory: {}".format(args.outdir))

    cluster_files(targets, args.indir, args.outdir,
                  keep=args.keep, mv=args.mv, force=args.force)


if __name__ == '__main__':
    main()