import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import re
import os
import subprocess
import sys

def download_file(url, filename):
    if os.path.exists(filename):
        print(f"{filename} already exists. Skipping download.")
        return
    
    print(f"Downloading {filename} from {url}...")
    try:
        subprocess.run(['wget', '-O', filename, url], check=True)
        print(f"Successfully downloaded {filename}")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading {filename}: {e}")
        sys.exit(1)

def download_data():
    # NCBI Files
    download_file('https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt', 'eukaryotes.txt')
    download_file('https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt', 'prokaryotes.txt')
    
    # UniProt Stats
    download_file('https://ftp.ebi.ac.uk/pub/databases/reference_proteomes/QfO/STATS', 'QfO_STATS')

# 1. Parse UniProt Stats
def parse_qfo_stats(filepath):
    data = []
    try:
        with open(filepath, 'r') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"Error: {filepath} not found.")
        return pd.DataFrame()
    
    start_index = 0
    for i, line in enumerate(lines):
        if line.startswith('Proteome_ID'):
            start_index = i + 1
            break
    
    for line in lines[start_index:]:
        if not line.strip():
            continue
        parts = line.split('\t')
        if len(parts) < 8:
            continue
        
        try:
            tax_id = int(parts[1])
            canonical_count_str = parts[2]
            match = re.match(r'(\d+)', canonical_count_str)
            if match:
                canonical_count = int(match.group(1))
            else:
                continue
            
            species_name = parts[-1].strip()
            
            data.append({
                'TaxID': tax_id,
                'Unique_Proteins': canonical_count,
                'UniProt_Name': species_name
            })
        except ValueError:
            continue
            
    return pd.DataFrame(data)

# 2. Load NCBI Data for Genome Size
def load_ncbi_size(filepath):
    try:
        df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip')
        if 'Size (Mb)' not in df.columns or 'TaxID' not in df.columns:
            return None
        
        df = df[['TaxID', 'Size (Mb)', '#Organism/Name']].copy()
        df.rename(columns={'#Organism/Name': 'NCBI_Name'}, inplace=True)
        df['Size (Mb)'] = pd.to_numeric(df['Size (Mb)'], errors='coerce')
        df['TaxID'] = pd.to_numeric(df['TaxID'], errors='coerce')
        df.dropna(subset=['Size (Mb)', 'TaxID'], inplace=True)
        df['TaxID'] = df['TaxID'].astype(int)
        
        # Take median size per TaxID
        df_grouped = df.groupby('TaxID').agg({'Size (Mb)': 'median', 'NCBI_Name': 'first'}).reset_index()
        
        return df_grouped
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None

def main():
    # Step 1: Download Data
    download_data()

    # Step 2: Parse UniProt Data
    uniprot_df = parse_qfo_stats('QfO_STATS')
    print(f"Loaded {len(uniprot_df)} records from UniProt STATS")

    # Step 3: Load NCBI Data
    euk_size = load_ncbi_size('eukaryotes.txt')
    prok_size = load_ncbi_size('prokaryotes.txt')
    
    if euk_size is None or prok_size is None:
        print("Failed to load NCBI data.")
        return

    ncbi_df = pd.concat([euk_size, prok_size], ignore_index=True)

    # Step 4: Merge Datasets
    merged_df = pd.merge(uniprot_df, ncbi_df, on='TaxID', how='inner')
    print(f"Merged data has {len(merged_df)} records.")

    # Step 5: Plotting
    plt.figure(figsize=(12, 8))

    euk_taxids = set(euk_size['TaxID'])
    prok_taxids = set(prok_size['TaxID'])

    def get_color(taxid):
        if taxid in euk_taxids: return 'tab:orange', 'Eukaryotes'
        if taxid in prok_taxids: return 'tab:blue', 'Prokaryotes'
        return 'gray', 'Other'

    colors = [get_color(tid)[0] for tid in merged_df['TaxID']]
    labels = [get_color(tid)[1] for tid in merged_df['TaxID']]
    merged_df['Color'] = colors
    merged_df['Label'] = labels

    # Scatter plot
    for label, color in [('Prokaryotes', 'tab:blue'), ('Eukaryotes', 'tab:orange')]:
        subset = merged_df[merged_df['Label'] == label]
        plt.scatter(subset['Size (Mb)'], subset['Unique_Proteins'], 
                    alpha=0.6, s=30, label=label, color=color, edgecolor='k', linewidth=0.5)

    # Highlight Human and Zebrafish
    specific_organisms = {
        9606: 'Homo sapiens', 
        7955: 'Danio rerio'
    }

    for tax_id, name in specific_organisms.items():
        row = merged_df[merged_df['TaxID'] == tax_id]
        if not row.empty:
            r = row.iloc[0]
            plt.scatter(r['Size (Mb)'], r['Unique_Proteins'], color='green', s=150, marker='*', zorder=10)
            plt.annotate(f"{name}\n({int(r['Unique_Proteins'])})", 
                         (r['Size (Mb)'], r['Unique_Proteins']), 
                         xytext=(10, -10), textcoords='offset points', 
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"), 
                         color='green', fontweight='bold')

    # Regression
    log_size = np.log10(merged_df['Size (Mb)'])
    log_genes = np.log10(merged_df['Unique_Proteins'])
    slope, intercept, r_value, p_value, std_err = stats.linregress(log_size, log_genes)

    x_range = np.linspace(merged_df['Size (Mb)'].min(), merged_df['Size (Mb)'].max(), 100)
    y_fit = 10**(intercept + slope * np.log10(x_range))
    plt.plot(x_range, y_fit, 'r--', label=f'Fit: $R^2$ = {r_value**2:.3f}')

    # Outliers
    residuals = log_genes - (intercept + slope * log_size)
    merged_df['residual'] = residuals
    outliers = pd.concat([merged_df.nlargest(3, 'residual'), merged_df.nsmallest(3, 'residual')])

    for idx, row in outliers.iterrows():
        if row['TaxID'] not in specific_organisms: # Don't double label
            plt.text(row['Size (Mb)'], row['Unique_Proteins'], row['UniProt_Name'].split('(')[0].strip(), fontsize=8)

    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Genome Size (Mb)')
    plt.ylabel('Unique Protein-Coding Genes (Canonical)')
    plt.title('Genome Size vs Unique Protein Count (Quick & Dirty Analysis)')
    plt.figtext(0.5, 0.01, "Note: Unique Proteins (UniProt QfO) vs Genome Size (NCBI median). Mismatched TaxIDs dropped.", ha="center", fontsize=9, bbox={"facecolor":"lightblue", "alpha":0.2, "pad":5})
    plt.legend()
    plt.grid(True, which="both", ls="-", alpha=0.2)

    output_file = 'uniprot_unique_protein_chart.png'
    plt.savefig(output_file, dpi=300)
    print(f"Chart saved to {output_file}")

if __name__ == "__main__":
    main()