#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ dduper - BTRFS Dedupe tool. This is a offline dedupe tool. Instead of reading whole file blocks and computing checksum, It works by fetching checksum from BTRFS csum tree. This hugely improves the performance. """ import argparse import errno import hashlib import numpy as np import math import os import pdb import struct import subprocess import sys from collections import OrderedDict from fcntl import ioctl from itertools import combinations from itertools import zip_longest from stat import * from timeit import default_timer as timer from prettytable import PrettyTable # 4kb block size blk_size = 4 # no.of csum on single row - right now its 8 no_of_chunks = 0 FICLONERANGE = 0x4020940d FIDEDUPERANGE = 0xc0189436 device_name = None skip = False chunk_sz = 0 run_len = 0 ele_sz = 0 fast_mode = False verbose = False analyze = False analyze_dict = OrderedDict() dst_file_sz = 0 # Already deduped files processed_files = [] # From https://stackoverflow.com/questions/434287 def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * int(n) return zip_longest(*args, fillvalue=fillvalue) def get_ele_size(chunk_sz): global no_of_chunks, run_len if chunk_sz <= 0 or chunk_sz % 128 != 0: print("Ensure chunk size is of multiple 128KB. (128,256,512 etc)") sys.exit(-1) no_of_chunks = chunk_sz // blk_size ele_sz = no_of_chunks // 8 run_len = no_of_chunks * blk_size * 1024 # print("\n chunk_sz=%d, no_of_chunks=%d, eke_sz=%d ",chunk_sz, no_of_chunks, ele_sz) return ele_sz def get_hashes(out1): """ For each list item compute its hash and store it with offset as its key. """ global ele_sz ccount = 0 # print("Running with ele_sz " + str(ele_sz)) if ele_sz == 1: od = OrderedDict() for idx, ele in enumerate(out1): v = [] k = hashlib.sha256(str(ele).encode('utf-8')).hexdigest() v.append(idx) if k in od: if verbose is True: print("Collision with: " + str(k) + "at offset: " + str(v)) # Get previous value v.extend(od.get(k)) ccount += 1 od[k] = v else: od = OrderedDict() for idx, ele in enumerate(grouper(out1, ele_sz, 'x')): v = [] k = hashlib.sha256(str(ele).encode('utf-8')).hexdigest() v.append(idx) if k in od: if verbose is True: print("Collision with: " + str(k) + "at offset: " + str(v)) # Get previous value v.extend(od.get(k)) ccount += 1 od[k] = v return od, ccount def ioctl_ficlonerange(dst_fd, s): try: ioctl(dst_fd, FICLONERANGE, s) except Exception as e: print("error({0})".format(e)) def ioctl_fideduperange(src_fd, s): try: v = ioctl(src_fd, FIDEDUPERANGE, s) _,_,_,_,_,_,_,bytes_dup,status,_ = struct.unpack("QQHHIqQQiH",v) return bytes_dup,status except Exception as e: print("error({0})".format(e)) def cmp_files(file1, file2): md1 = subprocess.Popen(['sha256sum', file1], stdout=subprocess.PIPE, close_fds=True ).stdout.read().decode().split(" ")[0] md2 = subprocess.Popen(['sha256sum', file2], stdout=subprocess.PIPE, close_fds=True ).stdout.read().decode().split(" ")[0] if md1 == md2: return 0 else: return 1 def validate_results(src_file, dst_file, bkup_file): ret = cmp_files(dst_file, bkup_file) if ret == 0: print("Dedupe validation successful " + src_file + ":" + dst_file) # Removing temporary backup file path os.unlink(bkup_file) else: msg = "\nFAILURE: Deduplication for " + dst_file + " resulted in corruption." + \ "You can restore original file from " + bkup_file print(msg) with open("/var/log/dduper_backupfile_info.log", "a") as wfd: wfd.write(msg) # TODO: Remove this file from further op def auto_adjust_chunk_sz(src_file_sz, analyze): global chunk_sz, ele_sz # Dont change chunk_sz for --analyze option if analyze is True: return chunk_sz, ele_sz # automatically change chunk size in case of perfect match. fz_kb = src_file_sz >> 10 fz_mb = src_file_sz >> 20 # print("File size is: " + str(fz_kb) + "KB or " + str(fz_mb) + "MB") if fz_mb >= 16: # file >= 16MB set chunk size 16MB perfect_match_chunk_sz = 16384 ele_sz = get_ele_size(1024 * 16) elif fz_mb >= 8: # file >= 8MB set chunk size 8MB perfect_match_chunk_sz = 8192 ele_sz = get_ele_size(1024 * 8) elif fz_mb >= 4: # file >= 4MB set chunk size 4MB perfect_match_chunk_sz = 4096 ele_sz = get_ele_size(1024 * 4) elif fz_mb >= 2: # file >= 2MB set chunk size 2MB perfect_match_chunk_sz = 2048 ele_sz = get_ele_size(1024 * 2) elif fz_mb >= 1: # file >= 1MB set chunk size 1MB perfect_match_chunk_sz = 1024 ele_sz = get_ele_size(1024) elif fz_kb >= 512: # file >= 512KB chunk_size 512KB perfect_match_chunk_sz = 512 ele_sz = get_ele_size(512) else: perfect_match_chunk_sz = 128 ele_sz = get_ele_size(128) return perfect_match_chunk_sz, ele_sz def btrfs_dump_csum(filename): global device_name btrfs_bin = "/usr/sbin/btrfs.static" if os.path.exists(btrfs_bin) is False: btrfs_bin = "btrfs" out = subprocess.Popen( [btrfs_bin, 'inspect-internal', 'dump-csum', filename, device_name], stdout=subprocess.PIPE, close_fds=True).stdout.readlines() return out def do_dedupe(src_file, dst_file, dry_run): global ele_sz, analyze src_file_sz = os.path.getsize(src_file) bkup_file = dst_file + ".__dduper" src_fd = os.open(src_file, os.O_RDONLY) dst_fd = os.open(dst_file, os.O_WRONLY) perfect_match = 0 perfect_match_chunk_sz = 0 out1 = btrfs_dump_csum(src_file) out2 = btrfs_dump_csum(dst_file) # FIXME: check for empty csums assert len(out1) != 0 assert len(out2) != 0 # todo : perfect match files. Remove dst_file from further operations if out1 == out2: print("Perfect match : ", src_file, dst_file) perfect_match = 1 perfect_match_chunk_sz, ele_sz = auto_adjust_chunk_sz(src_file_sz, analyze) # Get hashes now src_dict, src_ccount = get_hashes(out1) dst_dict, dst_ccount = src_dict, src_ccount else: src_dict, src_ccount = get_hashes(out1) dst_dict, dst_ccount = get_hashes(out2) total_entry = len(src_dict) - 1 # Fix missing final ele np1 = np.array([v for v in src_dict.keys()]) np2 = np.array([v for v in dst_dict.keys()]) matched_keys = np.intersect1d(np1, np2) unmatched_keys = np.setdiff1d(np2, np1) total_bytes_deduped = 0 if dry_run is False: # todo: Clear dict/np/list if there are not used further # todo : handle same content within single file if matched_keys is not None: if skip is False: bkup2 = subprocess.Popen( ['cp', '--reflink=always', dst_file, bkup_file], stdout=subprocess.PIPE) print("*" * 24) # print "matched regions" for location in matched_keys: entry = src_dict[location][0] src_len = no_of_chunks * blk_size * 1024 assert src_len <= 16777216 # 16MB limit src_offset = src_dict[location][0] * src_len multi_dst_offsets = dst_dict[location] # list for offset in multi_dst_offsets: dst_offset = offset * src_len if entry == total_entry: # fix final ele src_len = src_file_sz - src_offset # print("matching chunk : src offset:"+str(src_offset) +" src_len="+ str(src_len) +" dest_off="+ str(dst_offset)) if fast_mode is True: s = struct.pack("qQQQ", src_fd, src_offset, src_len, dst_offset) ioctl_ficlonerange(dst_fd, s) total_bytes_deduped += src_len else: bytes_deduped = 0 status = 0 s = struct.pack("QQHHIqQQiH", src_offset, src_len, 1, 0, 0, dst_fd, dst_offset, bytes_deduped, status, 0) bytes_deduped,status = ioctl_fideduperange(src_fd, s) total_bytes_deduped += bytes_deduped #print("\n bytes_deduped= %d %d " % (bytes_deduped, status)) print("Dedupe completed for " + src_file + ":" + dst_file) # Verify original unmodified file and newly deduped file both point to same contents if skip is False: validate_results(src_file, dst_file, bkup_file) # Close open fds os.close(src_fd) os.close(dst_fd) return display_summary(blk_size, chunk_sz, perfect_match_chunk_sz, src_file, dst_file, src_ccount, dst_ccount, total_bytes_deduped, perfect_match, dry_run, src_dict, dst_dict, matched_keys, unmatched_keys) def display_summary(blk_size, chunk_sz, perfect_match_chunk_sz, src_file, dst_file, src_ccount, dst_ccount, total_bytes_deduped, perfect_match, dry_run, src_dict, dst_dict, matched_keys, unmatched_keys): global dst_file_sz if perfect_match == 1: chunk = perfect_match_chunk_sz total_bytes_deduped = dst_file_sz else: chunk = chunk_sz # Compute matched and unmatched chunk info on dst_dict matched_chunks = 0 for k in matched_keys: matched_chunks += len(dst_dict.get(k)) unmatched_chunks = 0 for k in unmatched_keys: unmatched_chunks += len(dst_dict.get(k)) if analyze is False: print("Summary") print("blk_size : %dKB chunksize : %dKB" % (blk_size, chunk)) if src_ccount == 0: print(src_file + " has " + str(len(src_dict)) + " chunks") else: print(src_file + " has " + str(len(src_dict) + src_ccount) + " chunks") if dst_ccount == 0: print(dst_file + " has " + str(len(dst_dict)) + " chunks") else: print(dst_file + " has " + str(len(dst_dict) + dst_ccount) + " chunks") print("Matched chunks: " + str(matched_chunks)) print("Unmatched chunks: " + str(unmatched_chunks)) avail_dedupe = matched_chunks * chunk if dry_run is False: print("Total size(KB) deduped: " + str(total_bytes_deduped // 1024)) elif analyze is False: print("Total size(KB) available for dedupe: %d " % (avail_dedupe)) if dst_file_sz == (avail_dedupe * 1024): # print "whole file deduped, remove this file from further op: " + str(dst_file) # override perfect match to remove this whole file. perfect_match = 1 if analyze is True: if chunk_sz in analyze_dict: v = analyze_dict[chunk_sz] else: v = [] if perfect_match == 1: v.append(( src_file + ":" + dst_file, dst_file_sz // 1024, )) else: v.append(( src_file + ":" + dst_file, avail_dedupe, )) analyze_dict[chunk_sz] = v sys.stdout.write('[Analyzing] %s:%s \r' % (src_file, dst_file)) sys.stdout.flush() return perfect_match def validate_files(src_file, dst_file, processed_files): global dst_file_sz src_stat = os.stat(src_file) dst_stat = os.stat(dst_file) dst_file_sz = dst_stat.st_size global run_len if src_file in processed_files: return False if dst_file in processed_files: return False # Verify it's a unique regular file if (S_ISREG(src_stat.st_mode) == S_ISREG(dst_stat.st_mode) and (src_stat.st_ino != dst_stat.st_ino) and (src_stat.st_size >= 4096) and (dst_stat.st_size >= 4096)): # and (src_stat.st_size >= run_len) # and (dst_stat.st_size >= run_len)): return True if verbose is True: print("Skipped", src_file, dst_file, "not unique regular files or \ file size < 4kb") return False def dedupe_files(file_list, dry_run): ret = 0 global processed_files if len(file_list) == 2: src_file = file_list[0] dst_file = file_list[1] if validate_files(src_file, dst_file, processed_files) is True: ret = do_dedupe(src_file, dst_file, dry_run) elif len(file_list) > 2: comb = combinations(file_list, 2) for f in comb: src_file = f[0] dst_file = f[1] if validate_files(src_file, dst_file, processed_files) is True: # print src_file + " <-Dedupe-> " + dst_file ret = do_dedupe(src_file, dst_file, dry_run) # pdb.set_trace() if ret == 1: # perfectly matching file found or while file content deduped - stop re-use this file again. # print "removing " + str(dst_file) processed_files.append(dst_file) #else: #print src_file + " <-SKIP Dedupe-> " + dst_file + "chunk_size:" + str(chunk_sz) else: print("Single file given or Empty directory. Try again with --recurse") return def validate_file(filename): global run_len if os.path.exists(filename) is False: return False file_stat = os.stat(filename) # Verify its a unique regular file if (S_ISREG(file_stat.st_mode) and (file_stat.st_size >= 4096)): # and (file_stat.st_size >= run_len)): return True else: if verbose is True: print("Skipped", filename, "not unique regular files or \ file size < 4kb ") return False def dedupe_dir(dir_path, dry_run, recurse): file_list = [] if recurse is True: for dirname in dir_path: for path, dirs, files in os.walk(dirname): for filename in files: fn = os.path.join(path, filename) if validate_file(fn) is True: file_list.append(fn) else: for dirname in dir_path: for fi in os.listdir(dirname): if os.path.isfile(os.path.join(dirname, fi)): fn = os.path.join(dirname, fi) if validate_file(fn) is True: file_list.append(fn) dedupe_files(file_list, dry_run) def main(results): if results.file_list is not None: dedupe_files(results.file_list, results.dry_run) if results.dir_path is not None: dedupe_dir(results.dir_path, results.dry_run, results.recurse) return if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-p', '--device', action='store', dest='device_name', type=str, help='Device with BTRFS partition (ex: /dev/sda3) ', required=True) single = parser.add_mutually_exclusive_group() single.add_argument('-d', '--dir', action='store', dest='dir_path', nargs='+', type=str, help='Dedupe given directory or directories', required=False) single.add_argument('-f', '--files', action='store', dest='file_list', nargs='+', help='Dedupe list of files', type=str, required=False) parser.add_argument('-r', '--recurse', action='store_true', dest='recurse', help='Parse dir recursively (used along with -d)') parser.add_argument('-D', '--dry-run', action='store_true', dest='dry_run', help='Show summary of dedupe details') parser.add_argument('-s', '--skip', action='store_true', dest='skip', help='Will skip backup/validation process.') parser.add_argument('-c', '--chunk-size', action='store', dest='chunk_sz', type=int, default=128, help='Dedupe chunk size in KB', required=False) parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.04', help="Show version info") parser.add_argument('-m', '--fast-mode', action='store_true', dest='fast_mode', help='Use ficlonerange call', default=False) parser.add_argument('-V', '--verbose', action='store_true', dest='verbose', help='Show logs messages') parser.add_argument( '-a', '--analyze', action='store_true', dest='analyze', help='Report deduplicate data status with different chunk size') results = parser.parse_args() if not (results.dir_path or results.file_list): parser.error('No action requested, add --files or --dir') device_name = results.device_name skip = results.skip chunk_sz = results.chunk_sz fast_mode = results.fast_mode ele_sz = get_ele_size(chunk_sz) verbose = results.verbose start = timer() if fast_mode is False: skip = True if results.analyze is True: results.dry_run = True analyze = True for sz in 128, 256, 512, 1024, 2048, 4096, 8192, 16384: chunk_sz = sz ele_sz = get_ele_size(chunk_sz) main(results) del processed_files[:] # clear list for next chunk_sz iteration for k, v in analyze_dict.items(): total_sz = 0 table = PrettyTable() table.field_names = ["Chunk Size(KB)", "Files", "Duplicate(KB)"] for v1 in v: table_row = [] table_row.append(k) f, z = v1 total_sz += z table_row.append(str(f)) table_row.append(z) table.add_row(table_row) print(table) print( "dduper:%sKB of duplicate data found with chunk size:%dKB \n\n" % (total_sz, k)) else: main(results) print("dduper took " + str(timer() - start) + " seconds")