# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "pandas",
#   "numpy",
#   "seaborn",
#   "scikit-learn",
#   "scipy",
#   "requests",
#   "matplotlib",
#   "tabulate",
#   "wordcloud",
#   "tenacity"
# ]
# ///

"""
Imports
"""

from typing import Optional
from PIL import Image
import io
import os
import sys
import json
from typing import Tuple, Optional
from wordcloud import WordCloud
from tenacity import retry, stop_after_attempt

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from scipy.stats import skew, kurtosis, chi2_contingency


AIPROXY_TOKEN = os.getenv("AIPROXY_TOKEN")


def load_data(file_path: str) -> pd.DataFrame:
    """
    Tries to load a CSV file with various encodings to handle different character encodings.

    :param file_path: Path to the CSV file.
    :return: Pandas DataFrame containing the dataset.
    """
    encodings_to_try = [
        'utf8', 'latin1', 'ascii', 'us-ascii', 'big5', 'big5-tw', 'csbig5', 'big5hkscs', 'big5-hkscs',
        'cp037', 'IBM037', 'IBM039', 'cp273', 'IBM273', 'cp424', 'EBCDIC-CP-HE', 'IBM424',
        'cp437', 'IBM437', 'cp500', 'EBCDIC-CP-BE', 'EBCDIC-CP-CH', 'IBM500', 'cp720',
        'cp737', 'cp775', 'IBM775', 'cp850', 'IBM850', 'cp852', 'IBM852', 'cp855', 'IBM855',
        'cp856', 'cp857', 'IBM857', 'cp858', 'IBM858', 'cp860', 'IBM860', 'cp861', 'IBM861',
        'cp862', 'IBM862', 'cp863', 'IBM863', 'cp864', 'IBM864', 'cp865', 'IBM865', 'cp866',
        'IBM866', 'cp869', 'IBM869', 'cp874', 'cp875', 'cp932', 'ms932', 'ms-kanji', 'windows-31j',
        'cp949', 'ms949', 'uhc', 'cp950', 'ms950', 'cp1006', 'cp1026', 'ibm1026', 'cp1125',
        'ibm1125', 'cp1140', 'ibm1140', 'cp1250', 'windows-1250', 'cp1251', 'windows-1251',
        'cp1252', 'windows-1252', 'cp1253', 'windows-1253', 'cp1254', 'windows-1254', 'cp1255',
        'windows-1255', 'cp1256', 'windows-1256', 'cp1257', 'windows-1257', 'cp1258', 'windows-1258',
        'euc_jp', 'eucjp', 'ujis', 'u-jis', 'euc_jis_2004', 'jisx0213', 'eucjis2004', 'euc_jisx0213',
        'eucjisx0213', 'euc_kr', 'euckr', 'korean', 'ks_c-5601', 'ks_c-5601-1987', 'ksx1001',
        'ks_x-1001', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', 'iso2022jp', 'iso-2022-jp',
        'iso2022_jp_1', 'iso2022jp-1', 'iso2022_jp_2', 'iso2022jp-2', 'iso-2022-jp-2', 'iso2022_jp_2004',
        'iso2022jp-2004', 'iso-2022-jp-2004', 'iso2022_jp_3', 'iso2022jp-3', 'iso-2022-jp-3', 'iso2022_jp_ext',
        'iso2022jp-ext', 'iso-2022-jp-ext', 'iso2022_kr', 'iso2022kr', 'iso-2022-kr',
        'iso-8859-1', 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1', 'iso8859_2', 'iso-8859-2',
        'latin2', 'L2', 'iso8859_3', 'iso-8859-3', 'latin3', 'L3', 'iso8859_4', 'iso-8859-4', 'latin4',
        'L4', 'iso8859_5', 'iso-8859-5', 'cyrillic', 'iso8859_6', 'iso-8859-6', 'arabic', 'iso8859_7',
        'iso-8859-7', 'greek', 'greek8', 'iso8859_8', 'iso-8859-8', 'hebrew', 'iso8859_9', 'iso-8859-9',
        'latin5', 'L5', 'iso8859_10', 'iso-8859-10', 'latin6', 'L6', 'iso8859_11', 'iso-8859-11', 'thai',
        'iso8859_13', 'iso-8859-13', 'latin7', 'L7', 'iso8859_14', 'iso-8859-14', 'latin8', 'L8', 'iso8859_15',
        'iso-8859-15', 'latin9', 'L9', 'iso8859_16', 'iso-8859-16', 'latin10', 'L10', 'johab', 'cp1361',
        'ms1361', 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'kz_1048', 'strk1048_2002', 'rk1048', 'mac_cyrillic',
        'maccyrillic', 'mac_greek', 'macgreek', 'mac_iceland', 'maciceland', 'mac_latin2', 'maclatin2',
        'maccentraleurope', 'mac_centeuro', 'mac_roman', 'macroman', 'macintosh', 'mac_turkish', 'macturkish',
        'ptcp154', 'csptcp154', 'pt154', 'cp154', 'cyrillic-asian', 'shift_jis', 'csshiftjis', 'shiftjis',
        'sjis', 's_jis', 'shift_jis_2004', 'shiftjis2004', 'sjis_2004', 'sjis2004', 'shift_jisx0213',
        'shiftjisx0213', 'sjisx0213', 's_jisx0213', 'utf_32', 'U32', 'utf32', 'utf_32_be', 'UTF-32BE',
        'utf_32_le', 'UTF-32LE', 'utf_16', 'U16', 'utf16', 'utf_16_be', 'UTF-16BE', 'utf_16_le', 'UTF-16LE',
        'utf_7', 'U7', 'unicode-1-1-utf-7', 'utf_8', 'U8', 'UTF', 'cp65001', 'utf_8_sig'
    ]

    for encoding in encodings_to_try:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
            print(f"Successfully loaded data with {encoding} encoding.")
            return data
        except UnicodeDecodeError:
            print(f"""Failed to load data with {
                  encoding} encoding. Trying next...""")
        except Exception as e:
            print(f"An error occurred with {encoding} encoding: {e}")
            continue

    # We can't continue if we reach here
    raise NotImplementedError(
        "Failed to load the CSV file with all attempted encodings.")


def summarize_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """
    Summarizes the dataset by providing numerical and categorical summary statistics,
    and counting missing values.

    :param data: Pandas DataFrame containing the dataset.
    :return: A tuple containing:
             - Numerical summary (DataFrame)
             - Categorical summary (DataFrame)
             - Missing values (Series)
    """
    numerical_summary = data.describe().T[['mean', '50%', 'std', 'min', 'max']]
    categorical_summary = data.select_dtypes(
        include=['object']).describe().T[['top', 'freq']]
    missing_values = data.isnull().sum()

    return numerical_summary, categorical_summary, missing_values


def generate_visualizations(data: pd.DataFrame, file_path: str) -> str:
    """
    Generates visualizations (e.g., correlation heatmap) based on the dataset.

    :param data: Pandas DataFrame containing the dataset.
    :param file_path: Path to the CSV file.
    :return: Path to correlation matrix
    """
    dataset_name = os.path.splitext(os.path.basename(file_path))[0]
    folder_path = os.path.join(os.getcwd(), dataset_name)
    os.makedirs(folder_path, exist_ok=True)

    corr_path = os.path.join(folder_path, "correlation_matrix.png")
    plt.figure(figsize=(10, 8))
    numerical_cols = data.select_dtypes(include=['number']).columns
    if numerical_cols.shape[0] > 1:
        correlation_matrix = data[numerical_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
        plt.title("Correlation Matrix")
        plt.savefig(corr_path)
        plt.close()

    return corr_path


@retry(stop=stop_after_attempt(3))
def analyze_with_llm(filename: str, api_key: str) -> Optional[str]:
    """
    Analyzes the dataset using an LLM via a proxy API and returns string in markdown format.

    :param filename: Path to the CSV dataset.
    :param api_key: OpenAI API key for authenticating the request.
    :return: string with the LLM's analysis results, or None if the request fails.
    """

    data = load_data(filename)
    numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = data.select_dtypes(
        include=['object', 'category']).columns.tolist()
    time_series_cols = [
        col for col in data.columns if pd.api.types.is_datetime64_any_dtype(data[col])]
    text_cols = [
        col for col in categorical_cols if data[col].str.len().mean() > 50]
    column_info = {col: str(data[col].dtype) for col in data.columns}

    dataset_summary = data_summary(
        data, numerical_cols, categorical_cols, time_series_cols, text_cols)

    url = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data_prompt = f"""
    Analyze the dataset from the provided CSV file. Below is the summary information:

    Dataset Summary:
    Columns: {', '.join(data.columns)}
    Column Information (Data Types): {json.dumps(column_info, indent=2)}
    Summary Statistics (Key Insights):
    {dataset_summary}

    Based on this, please:
    1. Identify key trends and patterns.
    2. Point out any potential outliers or anomalies.
    3. Suggest any potential insights or analyses that might be valuable.
    4. Provide any other interesting observations from the dataset.

    Please return your findings in a MARKDOWN format, you can use html in it to further beautify it,
    highlighting the trends, insights, and any notable outliers or anomalies.
    """

    data_to_send = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "text"},
        "messages": [
            {
                "role": "system",
                "content": """You are an intelligent data analyst capable of providing insights from datasets.
                    You provide your insights in the form of a story which is very captivating.
                    You always support your claims with data. You only claim when you have data to back it up.
                    You are very professional and well versed in providing insights in comprehensive fashion.
                    You always return your findings in a MARKDOWN format you can use html in it to further beautify it.
                    You will never add placeholder images or dummy images"""
            },
            {
                "role": "user",
                "content": data_prompt
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data_to_send)

    if response.status_code == 200:
        result = response.json()
        analysis = result["choices"][0]["message"]["content"]
        print("Done Analysis")
        return analysis
    else:
        print(response.text)
        print(f"Error: {response.status_code}")
        raise Exception("Didn't work")


@retry(stop=stop_after_attempt(7))
def analyze_and_generate_graphs(data: pd.DataFrame, api_key: str) -> None:
    """
    Analyzes the dataset, identifies data types, and dynamically calls functions to generate graphs
    via function calling using an LLM.

    :param data: Pandas Dataframe.
    :param api_key: OpenAI API key for authenticating the request.
    """

    numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = data.select_dtypes(
        include=['object', 'category']).columns.tolist()
    time_series_cols = [
        col for col in data.columns if pd.api.types.is_datetime64_any_dtype(data[col])]
    text_cols = [
        col for col in categorical_cols if data[col].str.len().mean() > 50]

    # Descriptive Statistics for Numerical Columns
    dataset_summary = data_summary(
        data, numerical_cols, categorical_cols, time_series_cols, text_cols)

    functions = [
        {
            "name": "generate_numerical_charts",
            "description": "Generate histograms, box plots, and correlation heatmaps for numerical data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "numerical_cols": {
                        "type": "array",
                        "description": "List of numerical column names.",
                        "items": {"type": "string"}
                    },
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["numerical_cols", "output_folder"]
            }
        },
        {
            "name": "generate_categorical_charts",
            "description": "Generate bar plots for categorical data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "categorical_cols": {
                        "type": "array",
                        "description": "List of categorical column names.",
                        "items": {"type": "string"}
                    },
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["categorical_cols", "output_folder"]
            }
        },
        {
            "name": "generate_time_series_charts",
            "description": "Generate line plots for time-series data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "time_series_cols": {
                        "type": "array",
                        "description": "List of time-series column names.",
                        "items": {"type": "string"}
                    },
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["time_series_cols", "output_folder"]
            }
        },
        {
            "name": "generate_text_charts",
            "description": "Generate word clouds for text data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text_cols": {
                        "type": "array",
                        "description": "List of text column names.",
                        "items": {"type": "string"}
                    },
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["text_cols", "output_folder"]
            }
        },
        {
            "name": "generate_geospatial_charts",
            "description": "Generate scatter plots for geospatial data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "lat_col": {
                        "type": "string",
                        "description": "latitude column",
                        "items": {"type": "string"}
                    },
                    "lon_col": {
                        "type": "string",
                        "description": "longitude column"
                    },
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["geospatial_cols", "output_folder"]
            }
        },
        {
            "name": "generate_mixed_data_charts",
            "description": "Generate plots for both categorical and numerical data",
            "parameters": {
                "type": "object",
                "properties": {
                    "output_folder": {
                        "type": "string",
                        "description": "Folder path to save the generated charts."
                    }
                },
                "required": ["output_folder"]
            }
        }
    ]

    # API request payload
    url = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    dataset_name = os.path.splitext(os.path.basename(file_path))[0]
    folder_path = os.path.join(os.getcwd(), dataset_name)
    os.makedirs(folder_path, exist_ok=True)

    data_prompt = f"""
    Analyze the dataset and call the appropriate function(s) to generate graphs. Ensure each function
    receives only the relevant columns and parameters required for its operation. Call the most unique function that
    can be used for the given dataset.

    If there are only numerical and categorical columns, call the mixed function
    The dataset summary is {dataset_summary}
    The folder path to be used is {folder_path}.

    """

    data_to_send = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "text"},
        "messages": [
            {
                "role": "system",
                "content": """You are an intelligent data analyst capable of calling functions to analyze
                              datasets and generate relevant graphs. Your job is of atmost priority and any mistake can
                              cause huge losses. So you call functions very intelligently"""
            },
            {
                "role": "user",
                "content": data_prompt
            }
        ],
        "functions": functions,
        "function_call": "auto"
    }

    # Make the API request
    response = requests.post(url, headers=headers, json=data_to_send)

    if response.status_code == 200:
        result = response.json()
        # Extracting the function call result
        function_call_result = result.get("choices", [{}])[0].get(
            "message", {}).get("function_call", {})
        try:
            func = eval(function_call_result.get("name"))
            args = eval(function_call_result.get("arguments"))
            func(data, **args)
        except:
            generate_mixed_data_charts(data, folder_path)
        print("Analysis Done")
        return None
    else:
        print(response.text)
        print(f"Error: {response.status_code}")
        raise Exception("Didn't work")


def data_summary(data, numerical_cols, categorical_cols, time_series_cols, text_cols):
    numerical_summary = data[numerical_cols].describe().T
    numerical_summary['skewness'] = data[numerical_cols].skew()
    numerical_summary['kurtosis'] = data[numerical_cols].apply(kurtosis)

    # Correlation Analysis
    correlation_matrix = data[numerical_cols].corr()

    # Categorical Summary
    categorical_summary = {}
    for col in categorical_cols:
        freq_table = data[col].value_counts(normalize=True).head(5)
        categorical_summary[col] = freq_table.to_dict()

    # Chi-Squared Test for Independence (Example for Pairs of Categorical Variables)
    chi_squared_results = {}
    for i, col1 in enumerate(categorical_cols):
        for col2 in categorical_cols[i + 1:]:
            contingency_table = pd.crosstab(data[col1], data[col2])
            chi2, p, _, _ = chi2_contingency(contingency_table)
            chi_squared_results[f"{col1} vs {col2}"] = {
                'chi2': chi2, 'p_value': p}

    # Time-Series Summary (Feature Extraction)
    time_series_summary = {}
    for col in time_series_cols:
        time_series_summary[col] = {
            'start_date': data[col].min(),
            'end_date': data[col].max(),
            'unique_dates': data[col].nunique()
        }

    # Text Summary
    text_summary = {}
    for col in text_cols:
        text_summary[col] = {
            'avg_length': data[col].str.len().mean(),
            'max_length': data[col].str.len().max(),
            'top_words': pd.Series(' '.join(data[col]).split()).value_counts().head(5).to_dict()
        }

    # Final Summary
    dataset_summary = {
        "numerical_summary": numerical_summary.to_dict(),
        "correlation_matrix": correlation_matrix.to_dict(),
        "categorical_summary": categorical_summary,
        "chi_squared_results": chi_squared_results,
        "time_series_summary": time_series_summary,
        "text_summary": text_summary,
    }

    return dataset_summary


def save_readme(file_path: str, llm_response: str) -> None:
    """
    Saves the LLM response and analysis summary in a README.md file inside a folder
    named after the dataset (folder name will be the same as the dataset filename).

    :param file_path: Path to the dataset CSV file.
    :param llm_response: The response from the LLM to be written into the README.
    """
    readme_path = get_readme(file_path)

    with open(readme_path, "a", encoding="utf-8") as f:
        f.write("\n## LLM Insights\n")
        f.write(f"{llm_response}\n")

    print(f"Analysis complete. Results saved to {readme_path} and charts.")


def get_readme(file_path: str) -> str:
    """
    Generates the file path for the README.md based on the dataset name.

    :param file_path: Path to the dataset file.
    :return: Path to the README.md file inside a folder named after the dataset.
    """
    dataset_name = os.path.splitext(os.path.basename(file_path))[0]
    folder_path = os.path.join(os.getcwd(), dataset_name)
    os.makedirs(folder_path, exist_ok=True)

    readme_path = os.path.join(folder_path, "README.md")
    return readme_path


"""
Chart Functions for Data Analysis
"""

# 1. Numerical Data Charts


def generate_numerical_charts(data: pd.DataFrame, numerical_cols: list, output_folder: str) -> None:
    """
    Generates histograms, box plots, and correlation heatmaps for numerical data.

    :param data: Pandas DataFrame containing the dataset.
    :param numerical_cols: List of numerical column names.
    :param output_folder: Path to save the generated charts.
    """
    for col in numerical_cols:
        # Histogram
        plt.figure(figsize=(8, 6))
        sns.histplot(data[col], kde=True, bins=30, color='blue')
        plt.title(f"Histogram of {col}")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.savefig(os.path.join(output_folder, f"{col}_histogram.png"))
        plt.close()

        # Box Plot
        plt.figure(figsize=(6, 4))
        sns.boxplot(y=data[col], color='green')
        plt.title(f"Box Plot of {col}")
        plt.ylabel(col)
        plt.savefig(os.path.join(output_folder, f"{col}_boxplot.png"))
        plt.close()

    # Correlation Heatmap
    if len(numerical_cols) > 1:
        plt.figure(figsize=(10, 8))
        correlation_matrix = data[numerical_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
        plt.title("Correlation Matrix")
        plt.savefig(os.path.join(output_folder, "correlation_matrix.png"))
        plt.close()


# 2. Categorical Data Charts
def generate_categorical_charts(data: pd.DataFrame, categorical_cols: list, output_folder: str) -> None:
    """
    Generates bar plots for categorical data.

    :param data: Pandas DataFrame containing the dataset.
    :param categorical_cols: List of categorical column names.
    :param output_folder: Path to save the generated charts.
    """
    for col in categorical_cols:
        plt.figure(figsize=(10, 6))
        sns.countplot(data=data, x=col, palette="viridis",
                      hue=col, legend=False)
        plt.title(f"Count Plot of {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.savefig(os.path.join(output_folder, f"{col}_countplot.png"))
        plt.close()


# 3. Time-Series Data Charts
def generate_time_series_charts(data: pd.DataFrame, time_col: str, value_cols: list, output_folder: str) -> None:
    """
    Generates line plots for time-series data.

    :param data: Pandas DataFrame containing the dataset.
    :param time_col: Name of the time column.
    :param value_cols: List of columns to plot over time.
    :param output_folder: Path to save the generated charts.
    """
    for col in value_cols:
        plt.figure(figsize=(12, 6))
        plt.plot(data[time_col], data[col],
                 marker='o', linestyle='-', label=col)
        plt.title(f"Time-Series Plot of {col}")
        plt.xlabel(time_col)
        plt.ylabel(col)
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_folder, f"{col}_timeseries.png"))
        plt.close()


# 4. Text Data Charts
def generate_text_charts(data: pd.DataFrame, text_cols: str, output_folder: str) -> None:
    """
    Generates a word cloud for text data.

    :param data: Pandas DataFrame containing the dataset.
    :param text_col: Name of the text column.
    :param output_folder: Path to save the generated charts.
    """
    text_data = " ".join(data[text_cols].dropna().astype(str))
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate(text_data)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"Word Cloud for {text_cols}")
    plt.savefig(os.path.join(output_folder, f"{text_cols}_wordcloud.png"))
    plt.close()


# 5. Geospatial Data Charts
def generate_geospatial_charts(data: pd.DataFrame, lat_col: str, lon_col: str, output_folder: str) -> None:
    """
    Generates a scatter plot for geospatial data.

    :param data: Pandas DataFrame containing the dataset.
    :param lat_col: Name of the latitude column.
    :param lon_col: Name of the longitude column.
    :param output_folder: Path to save the generated charts.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(data[lon_col], data[lat_col], c='red', alpha=0.5)
    plt.title("Geospatial Scatter Plot")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.grid(True)
    plt.savefig(os.path.join(output_folder, "geospatial_scatterplot.png"))
    plt.close()


# 6. Mixed Data Charts
def generate_mixed_data_charts(data: pd.DataFrame, output_folder: str) -> None:
    """
    Detects data types and generates appropriate charts for each type.

    :param data: Pandas DataFrame containing the dataset.
    :param output_folder: Path to save the generated charts.
    """
    numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = data.select_dtypes(
        include=['object', 'category']).columns.tolist()

    if numerical_cols:
        generate_numerical_charts(data, numerical_cols, output_folder)
    if categorical_cols:
        generate_categorical_charts(data, categorical_cols, output_folder)


@retry(stop=stop_after_attempt(3))
def analyze_image_with_llm(image_path: str, api_key: str) -> Optional[str]:
    """
    Sends image data with reduced quality to an LLM via the proxy API and returns analysis results in markdown format.

    :param image_path: Path to the image file.
    :param api_key: OpenAI API key for authenticating the request.
    :return: string with the LLM's analysis results, or None if the request fails.
    """
    # Load and compress the image
    try:
        img = Image.open(image_path)
        img_buffer = io.BytesIO()
        img.save(img_buffer, format="PNG")
        img_buffer.seek(0)
    except Exception as e:
        print(f"Error loading or compressing image: {e}")
        return None

    # Prepare the API request
    url = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    data_prompt = f"""
    Analyze the provided correlation matrix and provide some insights. Heading should be correlation matrix analysis

    Return your findings in a MARKDOWN format, using HTML to enhance the readability of the results.
    """

    # Convert image bytes to a base64 string
    import base64
    img_base64 = base64.b64encode(img_buffer.read()).decode('utf-8')

    data_to_send = {
        "model": "gpt-4o-mini",
        "response_format": {"type": "text"},
        "messages": [
            {
                "role": "system",
                "content": """You are an intelligent image analyst capable of providing insights from images.
                    You describe the image in detail and provide interesting observations.
                    You always return your findings in a MARKDOWN format, using HTML to enhance readability.
                    Avoid placeholder or dummy content."""
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": data_prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{img_base64}",
                            "detail": "low"
                        }
                    }
                ]
            }
        ],
    }

    # Send the request
    response = requests.post(url, headers=headers, json=data_to_send)

    if response.status_code == 200:
        result = response.json()
        analysis = result["choices"][0]["message"]["content"]
        return analysis
    else:
        print(response.text)
        print(f"Error: {response.status_code}")
        raise Exception("Didn't work")


def main(file_path: str) -> None:
    """
    Orchestrates the loading, summarizing, and analyzing of the dataset.
    It generates summary statistics, visualizations, and interacts with an LLM for further analysis.

    :param file_path: Path to the CSV file to be analyzed.
    """
    data = load_data(file_path)

    numerical_summary, categorical_summary, missing_values = summarize_data(
        data)

    readme_path = get_readme(file_path)

    with open(readme_path, "w") as f:
        f.write("# Data Summary\n")
        f.write("## Numerical Summary\n")
        f.write(numerical_summary.to_markdown())
        f.write("\n## Categorical Summary\n")
        f.write(categorical_summary.to_markdown())
        f.write("\n## Missing Values\n")
        f.write(missing_values.to_markdown())

    corr_path = generate_visualizations(data, file_path)
    llm_response = None
    try:
        llm_response = analyze_image_with_llm(corr_path, AIPROXY_TOKEN)
    except:
        pass
    if llm_response:
        save_readme(file_path, llm_response)
    try:
        llm_response = analyze_with_llm(file_path, AIPROXY_TOKEN)
    except:
        pass
    if llm_response:
        save_readme(file_path, llm_response)
    try:
        analyze_and_generate_graphs(data, AIPROXY_TOKEN)
    except:
        pass
    print("Analysis complete. Results saved to README.md and charts.")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide a dataset filename.")
    else:
        file_path = sys.argv[1]
        main(file_path)