#!/usr/bin/env python # SPDX-License-Identifier: GPL-2.0-or-later # Copyright (c) 2022 Mike Fleetwood # FILE: find-overlap.py """find-overlap.py - Find overlapping range of a file system Command line tool which reads a file system (or any other data) and finds the overlapping range after an interrupted GParted resize/move. Works by computing the MD5 hash of every block and finding blocks with a duplicate copy to identify the overlap size and location. Takes as long to run as reading the named device. Command line usage: find-overlap.py [DEVICE] """ import argparse import hashlib import sys from collections import namedtuple PROGNAME = 'find-overlap.py' BLOCKSIZE = 1024*1024 dump_hashes_fname = None def read_hashes(f): """Return list of MD5 hashes for all blocks read from the open file object. """ md5_hashes = [] while True: data = f.read(BLOCKSIZE) if not data: break md5_hash = hashlib.md5(data).digest() md5_hashes.append(md5_hash) return md5_hashes def generate_matching_hashes(md5_hashes): """Return a dictionary with MD5 hash as the key to the list of blocks numbers with with the same hash from the list of MD5 hashes. Example: >>> generate_matching_hashes(['#0', '#0', '#2']) {'#0': [0, 1], '#2': [2]} Read the returned dictionary as key '#0' is the hash for data blocks 0 and 1, and key '#2' is the hash for block 2. """ matching_hashes = {} blknum = 0 for md5_hash in md5_hashes: if md5_hash in matching_hashes: matching_hashes[md5_hash].append(blknum) else: matching_hashes[md5_hash] = [blknum] blknum += 1 return matching_hashes def eliminate_non_duplicates(matching_hashes): """Mutate the passed matching hashes dictionary deleting items (MD5 hash keys and block number lists) of unique blocks and blocks with more than 4 copies, as neither contribute to finding the overlapping range. Example: >>> matching_hashes = {'#0': [0], '#1': [1, 2], '#2': [3], '#3': [4, 5, 6], '#4': [7, 8, 9, 10], '#5': [11, 12, 13, 14, 15]} >>> eliminate_non_duplicates(matching_hashes) >>> matching_hashes {'#1': [1, 2], '#3': [4, 5, 6], '#4': [7, 8, 9, 10]} """ # Real case test-hashes-f18089.txt.xz had: # Blocks: 748453 (1 MiB each) # Overlap size: 19585 # within overlap range # Unique blocks: 253290 # Matching pairs: 19756 19585 ( 99.1%) # Matching triples: 6 0 ( 0.0%) # Matching quadruples: 0 # # Test case test-hashes-t1.txt.xz had: # (Ext4 FS with 2 copies of 4.8 GiB of files and partial overlapping # copy performed) # Blocks: 20480 (1 MiB each) # Overlap size: 4096 # within overlap range # Unique blocks: 76 # Matching pairs: 1043 1043 (100.0%) # Matching triples: 3665 3665 (100.0%) # Matching quadruples: 0 # # Therefore keep matching blocks up to a count of 4 replicas to # allow for file systems containing multiple copies of files covered # by the overlapping range. for key, value in list(matching_hashes.items()): if len(value) < 2 or len(value) > 4: del matching_hashes[key] def compute_offset_blocks(matching_hashes): """Return a new dictionary keyed by the offset between two blocks with the same MD5 hash which looks up the list of the first of each pair of matched blocks. The second matching block is simply the first block number plus the offset used as the dictionary key. Example: >>> compute_offset_blocks({'#0': [1, 8], '#1': [10, 13], '#2': [11, 14], '#3': [12, 15], '#4': [20, 21, 22], '#5': [25, 26, 27, 28]}) {1: [20, 21, 25, 26, 27], 2: [20, 25, 26], 3: [10, 11, 12, 25], 7: [1]} Read the returned dictionary as saying blocks [20, 21, 25, 26, 27] have a partner with the same MD5 hash at offset 1, namely blocks [21, 22, 26, 27, 28] respectively, etc and finally block [1] has a partner with the same MD5 hash at offset 7. """ offset_blocks = {} for blknums in matching_hashes.values(): blknums_copy = list(blknums) while len(blknums_copy) >= 2: fst_dup_blknum = blknums_copy[0] blknums_copy.pop(0) for snd_dup_blknum in blknums_copy: offset = snd_dup_blknum - fst_dup_blknum if offset in offset_blocks: offset_blocks[offset].append(fst_dup_blknum) else: offset_blocks[offset] = [fst_dup_blknum] # Above iteration of matching_hashes dict values is probably in the # key (MD5 hash) order and definitely not in increasing block number # order. Sort the lists of duplicate block numbers (first block # number of a duplicate pair). for key in offset_blocks.keys(): offset_blocks[key].sort() return offset_blocks def find_start_matching_block(start, offset, md5_hashes): """Return starting block which still matches at offset From the start block, which is assumed to match the MD5 hash of block at start + offset, search backwards for the earliest block which still matches at block + offset. """ while start > 0: test = start - 1 if md5_hashes[test] != md5_hashes[test+offset]: break start = test return start def find_stop_matching_block(stop, offset, md5_hashes): """Return first block which no longer matches at offset From the stop block, search forwards finding the first block which no longer matches at block + offset. find_start_matching_block() and find_stop_matching_block() are a pair which find the Python slicing range [start:stop] of an overlapping range of blocks. """ while stop + offset < len(md5_hashes): if md5_hashes[stop] != md5_hashes[stop+offset]: break stop += 1 return stop def compute_candidate_ranges(offset_blocks, md5_hashes): """Return list of candidate overlapping ranges From the offset blocks dictionary compute a list of candidate overlapping ranges. Assumes that for each offset in the offset blocks dictionary there will only be one significant set of matching blocks, that of the overlap being searched for, and that the median (middle) block from the list is within that range. (Alternatively, what is the chance that your file system including it's metadata just happens to have a set of blocks that exactly mimics the overlapping range being searched for. *Incredibly* low for larger offset / overlap size). The rank is calculated as the size of the matching range divided by the size of the offset. (A rank of 1.0 indicates the matching range exactly equals the offset and a valid overlapping data range has been found). """ candidate_ranges = [] Candidate = namedtuple('Candidate', ['offset', 'start_block', 'stop_block', 'total_blocks', 'rank']) for offset, blocks in offset_blocks.items(): median_index = int(round((len(blocks)) / 2)) start_block = find_start_matching_block(blocks[median_index], offset, md5_hashes) stop_block = find_stop_matching_block(blocks[median_index], offset, md5_hashes) matching_size = stop_block - start_block rank = float(matching_size) / float(offset) candidate_ranges.append(Candidate(offset, start_block, stop_block, len(md5_hashes), rank)) candidate_ranges.sort(key=lambda c: c.rank, reverse=True) return candidate_ranges def candidate_is_full_range(candidate_range): """Return true if the size of matching range is the size of the offset Allow the matching range to be 1 block less in case of a failure to copy one block. """ matching_size = candidate_range.stop_block - candidate_range.start_block return matching_size + 1 >= candidate_range.offset def candidate_range_is_large_enough(candidate_range): """Exclude offsets of 2 or less because accepting groups of up to 4 block allows 4 blocks in a row with the same MD5 hash to be found as an overlapping data range of 2 blocks. """ matching_size = candidate_range.stop_block - candidate_range.start_block return matching_size > 2 def dump_hashes(fname, md5_hashes, matching_hashes): """Write substitute hashes to the named hashes dump file The 128-bit (8 byte) binary MD5 hashes are substituted by '#%d' where '%d' is the first block number with that hash. Each substitute hash is written on a separate line. Start of an example hash dump file: #0 #1 #2 """ try: f = open(fname, mode='w') except IOError as e: sys.exit(PROGNAME + ': ' + str(e)) for md5_hash in md5_hashes: f.write('#%d\n' % (matching_hashes[md5_hash][0])) f.close() def find_overlap_from_hashes(md5_hashes): """Return list of validated overlapping ranges from list of MD5 hashes """ matching_hashes = generate_matching_hashes(md5_hashes) if dump_hashes_fname: dump_hashes(dump_hashes_fname, md5_hashes, matching_hashes) eliminate_non_duplicates(matching_hashes) offset_blocks = compute_offset_blocks(matching_hashes) candidate_ranges = compute_candidate_ranges(offset_blocks, md5_hashes) candidate_ranges = list(filter(candidate_is_full_range, candidate_ranges)) candidate_ranges = list(filter(candidate_range_is_large_enough, candidate_ranges)) return candidate_ranges def print_overlap(cr): """Print one overlapping range""" print('Overlap of size %d blocks found.' % (cr.offset)) print('Range [%d:%d) overlaps [%d:%d).' % (cr.start_block, cr.stop_block, cr.start_block + cr.offset, cr.stop_block + cr.offset)) print ('Original file system size was %d blocks.' % (cr.total_blocks - cr.offset)) median_block = int(round((cr.start_block + cr.stop_block) / 2)) print('Restore original file system with:') print(' dd if=INPUT bs=%d count=%d of=OUTPUT' % (BLOCKSIZE, median_block)) print(' dd if=INPUT bs=%d skip=%d seek=%d of=OUTPUT' % (BLOCKSIZE, median_block + cr.offset, median_block)) def print_overlap_output(candidate_ranges): """Print the list of overlapping ranges""" if len(candidate_ranges) == 0: print('No overlapping range found') return print('Block size: %d bytes' % (BLOCKSIZE)) if len(candidate_ranges) > 1: print('WARNING: Multiple overlapping ranges found') for cr in candidate_ranges: print('') print_overlap(cr) def find_overlap_from_open_hashes_file(f): """Read hashes from previous dump file, search for the overlapping range and print the results """ md5_hashes = f.read().splitlines() candidate_ranges = find_overlap_from_hashes(md5_hashes) print_overlap_output(candidate_ranges) def find_overlap_from_open_file(f): """Search for the overlapping range and print the findings""" md5_hashes = read_hashes(f) candidate_ranges = find_overlap_from_hashes(md5_hashes) print_overlap_output(candidate_ranges) def main(args=None): """Parse command line arguments and calls the function to search for the overlapping range in the named device or stdin """ global dump_hashes_fname parser = argparse.ArgumentParser(description=""" Find overlapping portion of a file system after an interrupted GParted resize/move.""") parser.add_argument('--read-hashes', dest='read_hashes_fname', metavar='DUMP_FILE', help=""" Read previously saved hashes from this file instead of reading from stdin or named device""") parser.add_argument('--dump-hashes', dest='dump_hashes_fname', metavar='DUMP_FILE', help='Write hashes to this file') parser.add_argument('device', nargs='?', help=""" optional device or file to read""") args = parser.parse_args(args) dump_hashes_fname = args.dump_hashes_fname if args.read_hashes_fname: try: f = open(args.read_hashes_fname, mode='r') except IOError as e: return PROGNAME + ': ' + str(e) find_overlap_from_open_hashes_file(f) f.close() elif args.device: try: f = open(args.device, mode='rb') except IOError as e: return PROGNAME + ': ' + str(e) find_overlap_from_open_file(f) f.close() else: find_overlap_from_open_file(sys.stdin) if __name__ == '__main__': sys.exit(main())