#python anamolyDetect.py "Z:\Download\proteinGroups.txt" 51
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import sys
import os
from collections import Counter
import argparse
from scipy.stats import chisquare

def extract_first_digit(value):
    str_value = str(value)
    match = re.search(r'[1-9]', str_value)
    if match:
        return int(match.group())
    return None

def read_and_extract_column(file_path, column_index, has_header=True):
    try:
        if has_header:
            df = pd.read_csv(file_path, sep='\t', low_memory=False)
            if column_index >= len(df.columns):
                raise IndexError(f"Column index {column_index} is out of range. File has {len(df.columns)} columns.")
            column_data = df.iloc[:, column_index].tolist()
            column_name = df.columns[column_index]
        else:
            df = pd.read_csv(file_path, sep='\t', header=None, low_memory=False)
            if column_index >= len(df.columns):
                raise IndexError(f"Column index {column_index} is out of range. File has {len(df.columns)} columns.")
            column_data = df.iloc[:, column_index].tolist()
            column_name = None
        return column_data, column_name
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        sys.exit(1)
    except Exception as e:
        print(f"Error reading file: {e}")
        sys.exit(1)

def create_barplot(first_digits, total_values, output_file, title, column_data):
    digit_counts = Counter(first_digits)
    digits = list(range(1, 10))
    counts = [digit_counts.get(digit, 0) for digit in digits]
    
    # Calculate Benford's Law expected percentages
    benford_expected = [np.log10(1 + (1/digit)) * 100 for digit in digits]
    total_count = sum(counts)
    
    # Calculate expected counts based on Benford's Law
    expected_counts = [(percentage/100) * total_count for percentage in benford_expected]
    
    # Generate uniform random data in same range as original data
    numeric_values = []
    for value in column_data:
        try:
            if pd.notna(value) and str(value).strip() != '':
                num_val = float(str(value))
                if num_val > 0:  # Only positive values for first digit analysis
                    numeric_values.append(num_val)
        except (ValueError, TypeError):
            continue
    
    if numeric_values:
        min_val = min(numeric_values)
        max_val = max(numeric_values)
        # Generate same number of random values as we have valid first digits
        np.random.seed(42)  # For reproducible results
        random_values = np.random.uniform(min_val, max_val, len(first_digits))
        
        # Generate log10-uniform random data (as per Benford's Law theory)
        log_min = np.log10(min_val)
        log_max = np.log10(max_val)
        log_uniform_values = 10 ** np.random.uniform(log_min, log_max, len(first_digits))
        
        # Extract first digits from both types of random values
        random_first_digits = []
        log_uniform_first_digits = []
        for val in random_values:
            first_digit = extract_first_digit(val)
            if first_digit is not None:
                random_first_digits.append(first_digit)
        
        for val in log_uniform_values:
            first_digit = extract_first_digit(val)
            if first_digit is not None:
                log_uniform_first_digits.append(first_digit)
        
        random_digit_counts = Counter(random_first_digits)
        log_uniform_digit_counts = Counter(log_uniform_first_digits)
        random_counts = [random_digit_counts.get(digit, 0) for digit in digits]
        log_uniform_counts = [log_uniform_digit_counts.get(digit, 0) for digit in digits]
    else:
        random_values = []
        log_uniform_values = []
        random_counts = [0] * 9
        log_uniform_counts = [0] * 9
    
    plt.figure(figsize=(16, 8))
    
    # Create bar chart with observed and two types of random values
    x_pos = np.arange(len(digits))
    width = 0.25
    
    bars1 = plt.bar(x_pos - width, counts, width, color='orange', edgecolor='red', alpha=0.7, label='Observed Data')
    bars3 = plt.bar(x_pos, log_uniform_counts, width, color='lightgreen', edgecolor='darkgreen', alpha=0.7, label='Log10-Uniform Random')
    bars4 = plt.bar(x_pos + width, random_counts, width, color='lightcoral', edgecolor='darkred', alpha=0.7, label='Uniform Random')
    
    # Add Benford's Law expected as black dots
    dots = plt.scatter(x_pos, expected_counts, color='black', s=100, zorder=5, alpha=0.4, label='Benford\'s Law Expected')
    
    plt.xlabel('First Digit')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.xticks(x_pos, digits)
    plt.grid(axis='y', alpha=0.3)
    plt.legend()
    
    # Add count and percentage labels on observed bars
    max_height = max(max(counts), max(expected_counts), max(random_counts), max(log_uniform_counts))
    for bar, count in zip(bars1, counts):
        if count > 0:
            percentage = (count / total_count) * 100
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01 * max_height,
                    f'{count}\n({percentage:.1f}%)', ha='center', va='bottom', fontsize=6)
    
    # Add expected percentage labels below black dots
    for i, (exp_count, exp_percent) in enumerate(zip(expected_counts, benford_expected)):
        if exp_count > 0:
            plt.text(x_pos[i], exp_count - 0.03 * max_height,
                    f'{exp_count:.0f}\n({exp_percent:.1f}%)', ha='center', va='top', fontsize=6, color='black', 
                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8))
    
    # Add random percentage labels on random bars
    random_total = sum(random_counts)
    for bar, random_count in zip(bars4, random_counts):
        if random_count > 0 and random_total > 0:
            percentage = (random_count / random_total) * 100
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01 * max_height,
                    f'{random_count}\n({percentage:.1f}%)', ha='center', va='bottom', fontsize=6, color='darkred')
    
    # Add log-uniform percentage labels on log-uniform bars
    log_uniform_total = sum(log_uniform_counts)
    for bar, log_uniform_count in zip(bars3, log_uniform_counts):
        if log_uniform_count > 0 and log_uniform_total > 0:
            percentage = (log_uniform_count / log_uniform_total) * 100
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01 * max_height,
                    f'{log_uniform_count}\n({percentage:.1f}%)', ha='center', va='bottom', fontsize=6, color='darkgreen')
    
    valid_digits = sum(counts)
    random_digits = sum(random_counts)
    log_uniform_digits = sum(log_uniform_counts)
    plt.text(0.98, 0.98, f'Total values: {total_values}\nValid digits: {valid_digits}\nRandom digits: {random_digits}\nLog10-uniform: {log_uniform_digits}', 
             transform=plt.gca().transAxes, verticalalignment='top', horizontalalignment='right',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"Plot saved to: {output_file}")
    plt.close()
    
    # Create overlaying histogram for observed vs random values
    if numeric_values:
        hist_output_file = output_file.replace('.png', '_histogram.png')
        
        plt.figure(figsize=(12, 6))
        
        # Create histogram with both distributions
        plt.hist(numeric_values, bins=50, alpha=0.7, color='orange', edgecolor='red', 
                label=f'Observed Values (n={len(numeric_values)})', density=True)
        plt.hist(random_values, bins=50, alpha=0.7, color='lightcoral', edgecolor='darkred',
                label=f'Uniform Random (n={len(random_values)})', density=True)
        plt.hist(log_uniform_values, bins=50, alpha=0.7, color='lightgreen', edgecolor='darkgreen',
                label=f'Log10-Uniform Random (n={len(log_uniform_values)})', density=True)
        
        plt.xlabel('Value')
        plt.ylabel('Density')
        plt.title(f'Distribution Comparison: Observed vs Random Sampling\n{title}')
        plt.legend()
        plt.grid(axis='y', alpha=0.3)
        
        # Add statistics text box
        obs_mean = np.mean(numeric_values)
        obs_std = np.std(numeric_values)
        rand_mean = np.mean(random_values)
        rand_std = np.std(random_values)
        log_uniform_mean = np.mean(log_uniform_values)
        log_uniform_std = np.std(log_uniform_values)
        
        stats_text = f'Observed: μ={obs_mean:.2e}, σ={obs_std:.2e}\nUniform: μ={rand_mean:.2e}, σ={rand_std:.2e}\nLog10-Uniform: μ={log_uniform_mean:.2e}, σ={log_uniform_std:.2e}'
        plt.text(0.98, 0.98, stats_text, transform=plt.gca().transAxes, 
                verticalalignment='top', horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
        
        plt.tight_layout()
        plt.savefig(hist_output_file, dpi=300, bbox_inches='tight')
        print(f"Histogram saved to: {hist_output_file}")
        plt.close()
        
        # Create log2-transformed histogram
        log_hist_output_file = output_file.replace('.png', '_log2_histogram.png')
        
        # Apply log2 transformation (adding small epsilon to avoid log(0))
        epsilon = 1e-10
        log_numeric_values = np.log2(np.array(numeric_values) + epsilon)
        log_random_values = np.log2(np.array(random_values) + epsilon)
        log_log_uniform_values = np.log2(np.array(log_uniform_values) + epsilon)
        
        plt.figure(figsize=(12, 6))
        
        # Create log2 histogram with all three distributions
        plt.hist(log_numeric_values, bins=50, alpha=0.7, color='orange', edgecolor='red', 
                label=f'Observed Log2 Values (n={len(log_numeric_values)})', density=True)
        plt.hist(log_random_values, bins=50, alpha=0.7, color='lightcoral', edgecolor='darkred',
                label=f'Uniform Random Log2 (n={len(log_random_values)})', density=True)
        plt.hist(log_log_uniform_values, bins=50, alpha=0.7, color='lightgreen', edgecolor='darkgreen',
                label=f'Log10-Uniform Random Log2 (n={len(log_log_uniform_values)})', density=True)
        
        plt.xlabel('Log2(Value)')
        plt.ylabel('Density')
        plt.title(f'Log2-Transformed Distribution Comparison\n{title}')
        plt.legend()
        plt.grid(axis='y', alpha=0.3)
        
        # Add log2 statistics text box
        log_obs_mean = np.mean(log_numeric_values)
        log_obs_std = np.std(log_numeric_values)
        log_rand_mean = np.mean(log_random_values)
        log_rand_std = np.std(log_random_values)
        log_log_uniform_mean = np.mean(log_log_uniform_values)
        log_log_uniform_std = np.std(log_log_uniform_values)
        
        log_stats_text = f'Observed Log2: μ={log_obs_mean:.2f}, σ={log_obs_std:.2f}\nUniform Log2: μ={log_rand_mean:.2f}, σ={log_rand_std:.2f}\nLog10-Uniform Log2: μ={log_log_uniform_mean:.2f}, σ={log_log_uniform_std:.2f}'
        plt.text(0.98, 0.98, log_stats_text, transform=plt.gca().transAxes, 
                verticalalignment='top', horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))
        
        plt.tight_layout()
        plt.savefig(log_hist_output_file, dpi=300, bbox_inches='tight')
        print(f"Log2 histogram saved to: {log_hist_output_file}")
        plt.close()

parser = argparse.ArgumentParser(description='Extract first digits from a column and create bar plot')
parser.add_argument('file_path', nargs='?', default="https://zenodo.org/records/14557756/files/proteinGroups.txt?download=1", 
                    help='Path to tab-separated text file or URL (default: Zenodo proteinGroups.txt)')
parser.add_argument('column_index', type=int, nargs='?', default=51, help='Column index to extract (1-based, default: 51)')
parser.add_argument('--no-header', action='store_true', help='File has no header row')
parser.add_argument('--output', '-o', help='Output file path for saving the plot')
parser.add_argument('--title', '-t', help='Title for the plot')

args = parser.parse_args()

# Handle the case where user wants to use default file with custom column
# Usage: python script.py 52 (means default file, column 52)
if args.file_path.isdigit() and len(sys.argv) == 2:
    args.column_index = int(args.file_path)
    args.file_path = "https://zenodo.org/records/14557756/files/proteinGroups.txt?download=1"

column_index_zero_based = args.column_index - 1
print(f"Reading file: {args.file_path}")
print(f"Extracting column index: {args.column_index} (1-based)")

has_header = not args.no_header
column_data, column_name = read_and_extract_column(args.file_path, column_index_zero_based, has_header)
#column_data, column_name = read_and_extract_column("Z:\\Download\\proteinGroups.txt", 50, )

if args.output:
    output_file = args.output
else:
    if args.file_path.startswith('http'):
        # For URL, use current directory with default filename
        if column_name:
            safe_column_name = re.sub(r'[^\w\-_\.]', '_', column_name)
            output_file = f"proteinGroups_column_{safe_column_name}_first_digit_distribution.png"
        else:
            output_file = f"proteinGroups_column_{args.column_index}_first_digit_distribution.png"
    else:
        # For local file, use same directory as input file
        file_dir = os.path.dirname(args.file_path)
        base_name = os.path.splitext(os.path.basename(args.file_path))[0]
        if column_name:
            safe_column_name = re.sub(r'[^\w\-_\.]', '_', column_name)
            output_file = os.path.join(file_dir, f"{base_name}_column_{safe_column_name}_first_digit_distribution.png")
        else:
            output_file = os.path.join(file_dir, f"{base_name}_column_{args.column_index}_first_digit_distribution.png")

if args.title:
    title = args.title
else:
    if column_name:
        title = f"First Digit Distribution - {column_name}"
    else:
        title = f"First Digit Distribution - Column {args.column_index}"

print(f"Extracted {len(column_data)} values from column {args.column_index}")
if column_name:
    print(f"Column name: {column_name}")

first_digits = []
for value in column_data:
    first_digit = extract_first_digit(value)
    if first_digit is not None:
        first_digits.append(first_digit)
#np.savetxt("Z:\\Download\\proteinGroups_digit1.txt", first_digits)

print(f"Found {len(first_digits)} valid first digits")

if not first_digits:
    print("No valid first digits found in the data!")
    sys.exit(1)

# Generate uniform random data for comparison
numeric_values = []
for value in column_data:
    try:
        if pd.notna(value) and str(value).strip() != '':
            num_val = float(str(value))
            if num_val > 0:  # Only positive values for first digit analysis
                numeric_values.append(num_val)
    except (ValueError, TypeError):
        continue

random_first_digits = []
log_uniform_first_digits = []
if numeric_values:
    min_val = min(numeric_values)
    max_val = max(numeric_values)
    # Generate same number of random values as we have valid first digits
    np.random.seed(42)  # For reproducible results
    random_values = np.random.uniform(min_val, max_val, len(first_digits))
    
    # Generate log10-uniform random data (as per Benford's Law theory)
    log_min = np.log10(min_val)
    log_max = np.log10(max_val)
    log_uniform_values = 10 ** np.random.uniform(log_min, log_max, len(first_digits))
    
    # Extract first digits from both types of random values
    for val in random_values:
        first_digit = extract_first_digit(val)
        if first_digit is not None:
            random_first_digits.append(first_digit)
    
    for val in log_uniform_values:
        first_digit = extract_first_digit(val)
        if first_digit is not None:
            log_uniform_first_digits.append(first_digit)
else:
    random_values = []
    log_uniform_values = []

digit_counts = Counter(first_digits)
random_digit_counts = Counter(random_first_digits)
log_uniform_digit_counts = Counter(log_uniform_first_digits)
print("\nFirst digit distribution:")
print("Digit | Observed | Expected (Benford) | Log10-Uniform | Uniform Random")
print("------|----------|--------------------|--------------|--------------")
for digit in range(1, 10):
    count = digit_counts.get(digit, 0)
    random_count = random_digit_counts.get(digit, 0)
    log_uniform_count = log_uniform_digit_counts.get(digit, 0)
    observed_percent = (count / len(first_digits)) * 100 if first_digits else 0
    random_percent = (random_count / len(random_first_digits)) * 100 if random_first_digits else 0
    log_uniform_percent = (log_uniform_count / len(log_uniform_first_digits)) * 100 if log_uniform_first_digits else 0
    benford_percent = np.log10(1 + (1/digit)) * 100
    benford_count = int((benford_percent/100) * len(first_digits))
    print(f"  {digit}   |   {count:4d}    |    {benford_count:4d}          |   {log_uniform_count:4d}        |   {random_count:4d}")
    print(f"      | ({observed_percent:.1f}%)   |   ({benford_percent:.1f}%)        | ({log_uniform_percent:.1f}%)      | ({random_percent:.1f}%)")

# Perform chi-square test comparing observed vs Benford's Law expected
observed_counts = [digit_counts.get(digit, 0) for digit in range(1, 10)]
expected_percents = [np.log10(1 + (1/digit)) * 100 for digit in range(1, 10)]
expected_counts = [(percent/100) * len(first_digits) for percent in expected_percents]

# Alternative approach: Use percentages directly (might match Excel better)
observed_percents = [(count / len(first_digits)) * 100 for count in observed_counts]

# Method 1: Chi-square test using counts (scipy standard)
chi2_stat, p_value = chisquare(observed_counts, expected_counts)

# Method 2: Chi-square test using percentages directly
# This normalizes both observed and expected to percentages
chi2_percent = sum(((obs_pct - exp_pct)**2) / exp_pct for obs_pct, exp_pct in zip(observed_percents, expected_percents))

print(f"\nChi-square goodness-of-fit test (observed vs Benford's Law):")
print(f"Using counts: {chi2_stat:.6f}, p-value: {p_value:.6f}")

# Calculate p-value for percentage method manually using scipy.stats
from scipy.stats import chi2
p_value_percent = 1 - chi2.cdf(chi2_percent, len(observed_counts) - 1)
print(f"Using percentages: {chi2_percent:.6f}, p-value: {p_value_percent:.6f}")

# Additional tests for random data comparisons
if random_first_digits:
    random_counts = [random_digit_counts.get(digit, 0) for digit in range(1, 10)]
    random_percents = [(count / len(random_first_digits)) * 100 for count in random_counts]
    
    # Method 1: Standard counts
    chi2_random, p_random = chisquare(random_counts, expected_counts)
    
    # Method 2: Percentages directly
    chi2_random_percent = sum(((rand_pct - exp_pct)**2) / exp_pct for rand_pct, exp_pct in zip(random_percents, expected_percents))
    p_random_percent = 1 - chi2.cdf(chi2_random_percent, len(random_counts) - 1)
    
    print(f"\nUniform random vs Benford's Law:")
    print(f"Using counts: Chi-square: {chi2_random:.4f}, p-value: {p_random:.6f}")
    print(f"Using percentages: Chi-square: {chi2_random_percent:.4f}, p-value: {p_random_percent:.6f}")

if log_uniform_first_digits:
    log_uniform_counts = [log_uniform_digit_counts.get(digit, 0) for digit in range(1, 10)]
    log_uniform_percents = [(count / len(log_uniform_first_digits)) * 100 for count in log_uniform_counts]
    
    # Method 1: Standard counts
    chi2_log, p_log = chisquare(log_uniform_counts, expected_counts)
    
    # Method 2: Percentages directly
    chi2_log_percent = sum(((log_pct - exp_pct)**2) / exp_pct for log_pct, exp_pct in zip(log_uniform_percents, expected_percents))
    p_log_percent = 1 - chi2.cdf(chi2_log_percent, len(log_uniform_counts) - 1)
    
    print(f"Log10-uniform vs Benford's Law:")
    print(f"Using counts: Chi-square: {chi2_log:.4f}, p-value: {p_log:.6f}")
    print(f"Using percentages: Chi-square: {chi2_log_percent:.4f}, p-value: {p_log_percent:.6f}")

create_barplot(first_digits, len(column_data), output_file, title, column_data)