# -*- coding: utf-8 -*- """ Village Directory Comparison Tool Function: Compare columns B and D when columns A and C are identical, with configurable character difference threshold. """ import pandas as pd import argparse import os def levenshtein_distance(s1, s2): """ Calculate the edit distance (Levenshtein distance) between two strings. Edit distance represents the minimum number of single-character edits required to transform one string into another. """ if s1 is None or s2 is None: return float('inf') s1 = str(s1) s2 = str(s2) if len(s1) < len(s2): s1, s2 = s2, s1 if len(s2) == 0: return len(s1) previous_row = range(len(s2) + 1) for i, c1 in enumerate(s1): current_row = [i + 1] for j, c2 in enumerate(s2): # Cost of insertion, deletion, substitution insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row return previous_row[-1] def compare_villages(input_file, output_file, threshold): """ Compare village data Parameters: input_file: Input Excel file path output_file: Output Excel file path threshold: Allowed character difference threshold """ print(f"=" * 60) print(f"Village Directory Comparison Tool") print(f"=" * 60) print(f"Input file: {input_file}") print(f"Output file: {output_file}") print(f"Difference threshold: {threshold} (allows {threshold} character differences between B and D columns)") print(f"=" * 60) # Read Excel file df = pd.read_excel(input_file, header=None) # Get actual column names (row 2, index 1) col_names = df.iloc[1].tolist() # Extract data (starting from row 3, index 2) data = df.iloc[2:].copy() data.columns = ['A_Administrative', 'B_Name', 'C_Administrative', 'D_Name'] data = data.reset_index(drop=True) print(f"\nData Overview:") print(f" - Millennium Village sample count: {data['B_Name'].notna().sum()}") print(f" - Traditional Village directory count: {data['D_Name'].notna().sum()}") # Store match results matches = [] # Get all Millennium Village data (columns A and B), deduplicated millennium_villages = data[['A_Administrative', 'B_Name']].dropna(subset=['B_Name']) millennium_villages = millennium_villages.drop_duplicates() # Get all Traditional Village data (columns C and D), deduplicated traditional_villages = data[['C_Administrative', 'D_Name']].dropna(subset=['D_Name']) traditional_villages = traditional_villages.drop_duplicates() print(f"\nStarting comparison...") print(f" - Comparison condition: Column A matches Column C exactly") print(f" - Match condition: Edit distance between B and D <= {threshold}") # Iterate through Millennium Villages for idx1, row1 in millennium_villages.iterrows(): a_val = row1['A_Administrative'] b_val = row1['B_Name'] # Find records in Traditional Villages where column A matches for idx2, row2 in traditional_villages.iterrows(): c_val = row2['C_Administrative'] d_val = row2['D_Name'] # Check if columns A and C are identical if str(a_val).strip() == str(c_val).strip(): # Calculate edit distance between columns B and D distance = levenshtein_distance(b_val, d_val) # If edit distance is within threshold, record as match if distance <= threshold: matches.append({ 'millennium_village_administrative': a_val, 'millennium_village_name': b_val, 'traditional_village_administrative': c_val, 'traditional_village_name': d_val, 'character_difference': distance, 'exact_match': 'Yes' if distance == 0 else 'No' }) # Create result DataFrame result_df = pd.DataFrame(matches) # Statistics total_matches = len(result_df) exact_matches = len(result_df[result_df['character_difference'] == 0]) if total_matches > 0 else 0 fuzzy_matches = total_matches - exact_matches print(f"\nComparison Results:") print(f" - Total matches: {total_matches}") print(f" - Exact matches: {exact_matches}") print(f" - Fuzzy matches: {fuzzy_matches} (1-{threshold} character differences)") # Save results if total_matches > 0: # Sort by character difference result_df = result_df.sort_values(by=['character_difference', 'millennium_village_administrative']) result_df.to_excel(output_file, index=False, engine='openpyxl') print(f"\nResults saved to: {output_file}") # Display match details print(f"\nMatch Details:") print("-" * 80) for _, row in result_df.iterrows(): match_type = "Exact Match" if row['character_difference'] == 0 else f"Fuzzy Match ({row['character_difference']} char diff)" print(f" [{match_type}]") print(f" Millennium Village: {row['millennium_village_administrative']} - {row['millennium_village_name']}") print(f" Traditional Village: {row['traditional_village_administrative']} - {row['traditional_village_name']}") print("-" * 80) else: print(f"\nNo matching records found") # Still save empty result file result_df.to_excel(output_file, index=False, engine='openpyxl') return result_df def main(): parser = argparse.ArgumentParser( description='Village Directory Comparison Tool - Compare village names with fuzzy matching', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=''' Examples: python compare_villages.py -i data.xlsx python compare_villages.py -i data.xlsx -o result.xlsx python compare_villages.py -i data.xlsx -o result.xlsx -t 1 ''' ) parser.add_argument( '-i', '--input', required=True, help='Input Excel file path (required)' ) parser.add_argument( '-o', '--output', default=None, help='Output Excel file path (default: comparison_result.xlsx)' ) parser.add_argument( '-t', '--threshold', type=int, default=2, help='Character difference threshold (default: 2)' ) args = parser.parse_args() # Resolve input file path input_file = os.path.abspath(args.input) # Check if input file exists if not os.path.exists(input_file): print(f"Error: Input file does not exist: {input_file}") return 1 # Resolve output file path if args.output: output_file = os.path.abspath(args.output) else: # Generate default output filename with threshold output_file = os.path.join( os.path.dirname(input_file), f'comparison_result_t{args.threshold}.xlsx' ) # Execute comparison compare_villages(input_file, output_file, args.threshold) return 0 if __name__ == '__main__': exit(main())