#!/usr/bin/env python3

# Copyright (c) 2016, Antonio SJ Musumeci

# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.

# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import argparse
import ctypes
import errno
import fnmatch
import hashlib
import io
import os
import random
import shlex
import sys


_libc = ctypes.CDLL("libc.so.6", use_errno=True)
_lgetxattr = _libc.lgetxattr
_lgetxattr.argtypes = [ctypes.c_char_p, ctypes.c_char_p,
                       ctypes.c_void_p, ctypes.c_size_t]
# lgetxattr(2) returns ssize_t; without this ctypes assumes a C int.
_lgetxattr.restype = ctypes.c_ssize_t


def lgetxattr(path, name):
    """Read extended attribute *name* of *path* without following symlinks.

    *path* and *name* may be ``str`` or ``bytes``.  Returns the raw value
    as ``bytes``, or ``None`` when the attribute does not exist (ENODATA).
    Raises ``IOError`` (carrying errno and the path) for any other failure.
    """
    if isinstance(path, str):
        # fsencode round-trips surrogate-escaped names produced by os.walk,
        # so files with non-UTF-8 names resolve to their real on-disk bytes.
        path = os.fsencode(path)
    if isinstance(name, str):
        name = name.encode(errors='backslashreplace')

    length = 64
    while True:
        buf = ctypes.create_string_buffer(length)
        res = _lgetxattr(path, name, buf, ctypes.c_size_t(length))
        if res >= 0:
            return buf.raw[0:res]

        err = ctypes.get_errno()
        if err == errno.ERANGE:
            # Buffer too small for the value: retry with twice the room.
            length *= 2
        elif err == errno.ENODATA:
            return None
        else:
            raise IOError(err, os.strerror(err), path)


def ismergerfs(path):
    """Return True when *path* exposes the ``user.mergerfs.fullpath`` xattr.

    A missing attribute (``lgetxattr`` returning ``None``) and any error
    while reading it both mean the path is not treated as mergerfs.
    """
    try:
        return lgetxattr(path, b'user.mergerfs.fullpath') is not None
    except IOError:
        return False


def hash_file(filepath, hasher=None, blocksize=65536):
    """Return the hex digest of the full contents of *filepath*.

    *hasher* is any hashlib-style object (default: a new MD5); the file is
    read in chunks of *blocksize* bytes.
    """
    if hasher is None:
        hasher = hashlib.md5()
    with open(filepath, 'rb') as afile:
        for buf in iter(lambda: afile.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()


def short_hash_file(filepath, hasher=None, blocksize=65536, blocks=16):
    """Return a hex digest computed from a sample of *filepath*.

    Up to *blocks* chunks of *blocksize* bytes are read at pseudo-random
    offsets.  The generator is seeded with the file size, so files of equal
    size are sampled at identical offsets.  Files no larger than
    *blocksize* are read once from offset 0.
    """
    if hasher is None:
        hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        size = os.fstat(f.fileno()).st_size
        if size <= blocksize:
            size = 1
            blocks = 1

        # Private generator: same sequence as random.seed(size, version=2)
        # but leaves the process-wide random state untouched.
        rng = random.Random()
        rng.seed(size, version=2)
        for _ in range(blocks):
            f.seek(rng.randrange(size))
            buf = f.read(blocksize)
            if not buf:
                break
            hasher.update(buf)
    return hasher.hexdigest()


def sizeof_fmt(num):
    """Format a byte count using 1024-based units, e.g. ``1.5KB``."""
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return "%3.1f%sB" % (num, unit)
        num /= 1024.0
    return "%.1f%sB" % (num, 'Y')


def stat_files(paths):
    """Return ``[(path, os.stat_result), ...]`` for every path that can be
    stat'ed; paths whose ``os.stat`` fails with ``OSError`` are dropped."""
    rv = []
    for path in paths:
        try:
            rv.append((path, os.stat(path)))
        except OSError:
            pass
    return rv


def remove(files, execute, verbose):
    """Print an ``rm`` command for each ``(path, stat)`` entry and, when
    *execute* is true, delete the file.  Errors are printed, not raised.

    *verbose* is accepted for interface compatibility and is not used.
    """
    for (path, stat) in files:
        try:
            print('rm -vf', shlex.quote(path))
            if execute:
                os.remove(path)
        except Exception as e:
            print("%s" % e)


def print_stats(stats):
    """Print a numbered, commented summary (owner, mode, size, mtime) of
    each ``(path, stat)`` entry."""
    for (num, (path, st)) in enumerate(stats, start=1):
        print("# %i: %s" % (num, path))
        data = ("#   - uid: {0:5}; gid: {1:5}; mode: {2:6o}; "
                "size: {3}; mtime: {4}").format(
                    st.st_uid,
                    st.st_gid,
                    st.st_mode,
                    sizeof_fmt(st.st_size),
                    st.st_mtime)
        print(data)


def total_size(stats):
    """Return the sum of ``st_size`` over all ``(path, stat)`` entries."""
    return sum(st.st_size for (_, st) in stats)


def manual_dedup(fullpath, stats):
    """Ask the user which entry of *stats* to keep.

    The kept entry is removed from *stats* (what remains gets deleted by
    the caller).  Entering ``s`` empties *stats* so the file is skipped.
    Invalid input re-prompts.
    """
    while True:
        value = input("# Which to keep? ('s' to skip):")
        if value.lower() == 's':
            stats.clear()
            return

        try:
            index = int(value) - 1
            if index < 0 or index >= len(stats):
                raise ValueError
        except ValueError:
            print("Input error: enter a value [1-{0}] or skip by entering 's'".format(len(stats)))
            continue

        del stats[index]
        return


def mtime_all(stats):
    """Return True when every entry has the same mtime (True if empty)."""
    if not stats:
        return True
    mtime = stats[0][1].st_mtime
    return all(st.st_mtime == mtime for (_, st) in stats)


def mtime_any(mtime, stats):
    """Return True when at least one entry has exactly *mtime*."""
    return any(st.st_mtime == mtime for (_, st) in stats)


def size_all(stats):
    """Return True when every entry has the same size (True if empty)."""
    if not stats:
        return True
    size = stats[0][1].st_size
    return all(st.st_size == size for (_, st) in stats)


def size_any(size, stats):
    """Return True when at least one entry has exactly *size*."""
    return any(st.st_size == size for (_, st) in stats)


def md5sums_all(stats):
    """Return True when all entries share one size and one full MD5.

    Files are only hashed when the sizes already agree.  An empty list
    yields False.
    """
    if not stats or not size_all(stats):
        return False
    hashval = hash_file(stats[0][0])
    return all(hash_file(path) == hashval for (path, _) in stats[1:])


def short_md5sums_all(stats):
    """Like ``md5sums_all`` but compares the sampled ``short_hash_file``
    digest instead of hashing whole files."""
    if not stats or not size_all(stats):
        return False
    hashval = short_hash_file(stats[0][0])
    return all(short_hash_file(path) == hashval for (path, _) in stats[1:])


def _keep_first_by(stats, attr, reverse):
    """Sort *stats* in place by stat attribute *attr*, then remove and
    return the first entry (the copy to keep)."""
    stats.sort(key=lambda entry: getattr(entry[1], attr), reverse=reverse)
    return stats.pop(0)


def oldest_dedup(fullpath, stats):
    """Keep the entry with the smallest mtime; when all sizes and mtimes
    match, keep the one on the drive with the most free space."""
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return
    _keep_first_by(stats, 'st_mtime', reverse=False)


def strict_oldest_dedup(fullpath, stats):
    """Keep the oldest entry; empty *stats* (skip) when another entry has
    the same mtime."""
    oldest = _keep_first_by(stats, 'st_mtime', reverse=False)
    if mtime_any(oldest[1].st_mtime, stats):
        stats.clear()


def newest_dedup(fullpath, stats):
    """Keep the entry with the largest mtime; when all sizes and mtimes
    match, keep the one on the drive with the most free space."""
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return
    _keep_first_by(stats, 'st_mtime', reverse=True)


def strict_newest_dedup(fullpath, stats):
    """Keep the newest entry; empty *stats* (skip) when another entry has
    the same mtime."""
    newest = _keep_first_by(stats, 'st_mtime', reverse=True)
    if mtime_any(newest[1].st_mtime, stats):
        stats.clear()


def largest_dedup(fullpath, stats):
    """Keep the largest entry; when all sizes and mtimes match, keep the
    one on the drive with the most free space."""
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return
    _keep_first_by(stats, 'st_size', reverse=True)


def strict_largest_dedup(fullpath, stats):
    """Keep the largest entry; empty *stats* (skip) when another entry has
    the same size."""
    largest = _keep_first_by(stats, 'st_size', reverse=True)
    if size_any(largest[1].st_size, stats):
        stats.clear()


def smallest_dedup(fullpath, stats):
    """Keep the smallest entry; when all sizes and mtimes match, keep the
    one on the drive with the most free space."""
    if size_all(stats) and mtime_all(stats):
        drive_with_most_space_dedup(fullpath, stats)
        return
    _keep_first_by(stats, 'st_size', reverse=False)


def strict_smallest_dedup(fullpath, stats):
    """Keep the smallest entry; empty *stats* (skip) when another entry has
    the same size."""
    smallest = _keep_first_by(stats, 'st_size', reverse=False)
    if size_any(smallest[1].st_size, stats):
        stats.clear()


def calc_space_free(stat):
    """Return ``f_frsize * f_bfree`` from ``os.statvfs`` for the path of a
    ``(path, stat)`` entry."""
    st = os.statvfs(stat[0])
    return st.f_frsize * st.f_bfree


def drive_with_most_space_dedup(fullpath, stats):
    """Keep the entry whose filesystem reports the most free space."""
    stats.sort(key=calc_space_free, reverse=True)
    stats.pop(0)


def mergerfs_getattr_dedup(origpath, stats):
    """Keep the entry whose path equals ``user.mergerfs.fullpath`` of
    *origpath*.

    When no entry matches, *stats* is emptied so the caller skips the file
    instead of deleting every copy.
    """
    fullpath = getxattr(origpath, b'user.mergerfs.fullpath')
    for entry in stats:
        if entry[0] == fullpath:
            stats.remove(entry)
            break
    else:
        # Nothing identified as the copy to keep: never delete them all.
        stats.clear()


def get_dedupfun(name, strict):
    """Return the keep-selection function for strategy *name*.

    With *strict* true the ``strict-`` variant is looked up; strategies
    without a distinct strict form map to the same function either way.
    Raises ``KeyError`` for an unknown name.
    """
    if strict:
        name = 'strict-' + name
    funs = {
        'manual': manual_dedup,
        'strict-manual': manual_dedup,
        'mostfreespace': drive_with_most_space_dedup,
        'strict-mostfreespace': drive_with_most_space_dedup,
        'newest': newest_dedup,
        'strict-newest': strict_newest_dedup,
        'oldest': oldest_dedup,
        'strict-oldest': strict_oldest_dedup,
        'largest': largest_dedup,
        'strict-largest': strict_largest_dedup,
        'smallest': smallest_dedup,
        'strict-smallest': strict_smallest_dedup,
        'mergerfs': mergerfs_getattr_dedup,
        'strict-mergerfs': mergerfs_getattr_dedup,
    }
    return funs[name]


def get_ignorefun(name):
    """Return the predicate deciding whether a set of copies is ignored.

    *name* is one of the ``--ignore`` choices, or ``None`` for a predicate
    that never ignores.  Raises ``KeyError`` for an unknown name.
    """
    funs = {
        None: lambda x: None,
        'same-time': mtime_all,
        'diff-time': lambda x: not mtime_all(x),
        'same-size': size_all,
        'diff-size': lambda x: not size_all(x),
        'same-hash': md5sums_all,
        'diff-hash': lambda x: not md5sums_all(x),
        'same-short-hash': short_md5sums_all,
        'diff-short-hash': lambda x: not short_md5sums_all(x),
    }
    return funs[name]


def getxattr(path, key):
    """Return xattr *key* of *path* decoded as UTF-8.

    Returns ``''`` when the attribute is missing or empty, or when its
    value is not valid UTF-8 (the error and raw value are printed).
    Other ``IOError``s propagate.
    """
    try:
        attr = lgetxattr(path, key)
        if attr:
            return attr.decode('utf-8')
        return ''
    except IOError as e:
        if e.errno == errno.ENODATA:
            return ''
        raise
    except UnicodeDecodeError as e:
        print(e)
        print(attr)
        return ''


def match(filename, matches):
    """Return True when *filename* matches any fnmatch pattern in
    *matches*."""
    return any(fnmatch.fnmatch(filename, pattern) for pattern in matches)


def dedup(fullpath, verbose, ignorefun, execute, dedupfun):
    """Deduplicate the copies of one pool file across branches.

    The branch paths come from the ``user.mergerfs.allpaths`` xattr of
    *fullpath* (NUL separated).  *ignorefun* may veto the file, *dedupfun*
    removes the copy to keep from the candidate list, and the remaining
    copies are printed as ``rm`` commands (when *verbose*) and deleted
    (when *execute*).

    Returns the number of bytes reclaimed: the projected amount in a dry
    run, or the size of the files actually removed when executing.
    Per-file errors are printed and yield 0 instead of aborting.
    """
    paths = getxattr(fullpath, b'user.mergerfs.allpaths').split('\0')
    if len(paths) <= 1:
        return 0

    stats = stat_files(paths)
    # With fewer than two stat-able copies there is nothing to deduplicate.
    if len(stats) <= 1:
        return 0
    candidates = len(stats)

    try:
        if ignorefun(stats):
            if verbose >= 2:
                print('# ignored:', fullpath)
            return 0

        if dedupfun == manual_dedup:
            print('#', fullpath)
            print_stats(stats)

        dedupfun(fullpath, stats)
        # An empty list means "skip"; an unchanged count means no copy was
        # selected to keep, so deleting would destroy the file entirely.
        if not stats or len(stats) >= candidates:
            if verbose >= 2:
                print('# skipped:', fullpath)
            return 0

        if dedupfun != manual_dedup:
            if verbose >= 2:
                print('#', fullpath)
            if verbose >= 3:
                print_stats(stats)

        saved = 0
        for (path, stat) in stats:
            try:
                if verbose:
                    print('rm -vf', shlex.quote(path))
                if execute:
                    os.remove(path)
                saved += stat.st_size
            except BrokenPipeError:
                raise
            except Exception as e:
                print('#', e)
        return saved
    except BrokenPipeError:
        # Closed stdout is handled once, by the caller.
        raise
    except Exception as e:
        print(e)
        return 0


def print_help():
    """Print the full usage/help text to stdout."""
    text = \
'''
usage: mergerfs.dedup [<options>] <dir>

Remove duplicate files across branches of a mergerfs pool. Provides
multiple algos for determining which file to keep and what to skip.

positional arguments:
  dir                    Starting directory

optional arguments:
  -v, --verbose          Once to print `rm` commands
                         Twice for status info
                         Three for file info
  -i, --ignore=          Ignore files if... (default: none)
                         * same-size       : have the same size
                         * diff-size       : have different sizes
                         * same-time       : have the same mtime
                         * diff-time       : have different mtimes
                         * same-hash       : have the same md5sum
                         * diff-hash       : have different md5sums
                         * same-short-hash : have the same short md5sums
                         * diff-short-hash : have different short md5sums
                         'hash' is expensive. 'short-hash' far less
                         expensive, not as safe, but pretty good.
  -d, --dedup=           What file to *keep* (default: mergerfs)
                         * manual        : ask user
                         * oldest        : file with smallest mtime
                         * newest        : file with largest mtime
                         * largest       : file with largest size
                         * smallest      : file with smallest size
                         * mostfreespace : file on drive with most free space
                         * mergerfs      : file selected by the mergerfs
                                           getattr policy
  -s, --strict           Skip dedup if all files have same (mtime,size) value.
                         Only applies to oldest, newest, largest, smallest.
  -e, --execute          Will not perform file removal without this.
  -I, --include=         fnmatch compatible filter to include files.
                         Can be used multiple times.
  -E, --exclude=         fnmatch compatible filter to exclude files.
                         Can be used multiple times.
  -D, --exclude-dir=     Directories to exclude from search.
                         Can be used multiple times.
'''
    print(text)


def buildargparser():
    """Build the command-line parser.

    Built-in ``-h`` handling is disabled; ``--help`` is a plain flag so the
    caller can print the hand-written help text instead.
    """
    desc = 'dedup files across branches in a mergerfs pool'
    usage = 'mergerfs.dedup [<options>] <dir>'
    parser = argparse.ArgumentParser(add_help=False,
                                     description=desc,
                                     usage=usage)
    parser.add_argument('dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='starting directory')
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0)
    parser.add_argument('-i', '--ignore',
                        choices=['same-size', 'diff-size',
                                 'same-time', 'diff-time',
                                 'same-hash', 'diff-hash',
                                 'same-short-hash',
                                 'diff-short-hash'])
    parser.add_argument('-d', '--dedup',
                        choices=['manual',
                                 'oldest', 'newest',
                                 'smallest', 'largest',
                                 'mostfreespace',
                                 'mergerfs'],
                        default='mergerfs')
    parser.add_argument('-s', '--strict',
                        action='store_true')
    parser.add_argument('-e', '--execute',
                        action='store_true')
    parser.add_argument('-I', '--include',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-E', '--exclude',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-D', '--exclude-dir',
                        dest='excludedir',
                        type=str,
                        action='append',
                        default=[])
    parser.add_argument('-h', '--help',
                        action='store_true')
    return parser


def main():
    """Entry point: parse arguments, walk the tree and dedup each file."""
    # Re-wrap the standard streams so file names that cannot be encoded
    # are printed with backslash escapes instead of raising.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer,
                                  encoding='utf8',
                                  errors='backslashreplace',
                                  line_buffering=True)

    parser = buildargparser()
    args = parser.parse_args()

    if args.help or not args.dir:
        print_help()
        sys.exit(0)

    args.dir = os.path.realpath(args.dir)
    if not ismergerfs(args.dir):
        print("%s is not a mergerfs directory" % args.dir)
        sys.exit(1)

    dedupfun = get_dedupfun(args.dedup, args.strict)
    ignorefun = get_ignorefun(args.ignore)
    verbose = args.verbose
    execute = args.execute
    includes = ['*'] if not args.include else args.include
    excludes = args.exclude
    excluded_dirs = set(args.excludedir)

    total_saved = 0
    try:
        for (dirname, dirnames, filenames) in os.walk(args.dir, topdown=True):
            # Prune in place so os.walk does not descend into excluded dirs.
            dirnames[:] = [d for d in dirnames if d not in excluded_dirs]
            for filename in filenames:
                if match(filename, excludes):
                    continue
                if not match(filename, includes):
                    continue
                fullpath = os.path.join(dirname, filename)
                total_saved += dedup(fullpath, verbose, ignorefun,
                                     execute, dedupfun)
    except KeyboardInterrupt:
        print("# exiting: CTRL-C pressed")
    except IOError as e:
        if e.errno != errno.EPIPE:
            raise
        # The reader of stdout went away.  Point stdout at devnull so
        # neither the summary nor the flush at exit raises again.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(0)

    print('# Total savings:', sizeof_fmt(total_saved))

    sys.exit(0)


if __name__ == "__main__":
    main()