# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "pandas",
#     "matplotlib",
#     "seaborn",
#     "numpy",
#     "requests",
#     "scikit-learn",
#     "scipy",
#     "tabulate",
# ]
# ///
import os
import sys
import json
import subprocess

# Install missing dependencies dynamically. This must run before the heavy
# third-party imports below, otherwise a missing library would crash the
# script before the installer ever gets a chance to run. Note that the pip
# package name and the import name can differ (e.g. scikit-learn -> sklearn).
REQUIRED_LIBRARIES = {
    "pandas": "pandas",
    "matplotlib": "matplotlib",
    "seaborn": "seaborn",
    "numpy": "numpy",
    "requests": "requests",
    "scikit-learn": "sklearn",
    "scipy": "scipy",
    "tabulate": "tabulate",  # required by pandas' to_markdown()
}
for package, module in REQUIRED_LIBRARIES.items():
    try:
        __import__(module)
    except ImportError:
        print(f"Installing missing library: {package}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import requests
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import zscore

# AI Proxy details
API_URL = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
API_TOKEN = os.getenv("API_TOKEN")


def query_llm(prompt):
    """Query the LLM through the AI Proxy and return its reply text."""
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": "You are a data analyst. Provide insights based on the provided dataset summary."},
            {"role": "user", "content": prompt},
        ],
    }
    try:
        # A timeout keeps the script from hanging indefinitely if the proxy stalls.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"Error querying the LLM: {e}")
        return "Unable to retrieve insights from the LLM due to an error."


def detect_outliers(df):
    """Detect outliers per numeric column using z-scores (|z| > 3)."""
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
    outliers = {}
    for col in numeric_cols:
        values = df[col].dropna()
        z_scores = pd.Series(zscore(values), index=values.index)
        outliers[col] = df.loc[z_scores[np.abs(z_scores) > 3].index]
    return outliers


def perform_clustering(df):
    """Perform k-means clustering (k=3) on the numeric columns."""
    numeric_cols = df.select_dtypes(include=["float64", "int64"])
    if len(numeric_cols.columns) < 2:
        return None  # Clustering requires at least 2 features
    numeric_data = numeric_cols.dropna()
    if len(numeric_data) < 3:
        return None  # KMeans needs at least as many rows as clusters
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(numeric_data)
    # Create a cluster column, leaving NaN for rows dropped due to missing values
    df["Cluster"] = np.nan
    df.loc[numeric_data.index, "Cluster"] = clusters
    return df, kmeans.cluster_centers_


def feature_importance(df, target_col):
    """Identify feature importance using a random forest regressor."""
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).drop(columns=[target_col], errors="ignore")
    target = df[target_col]
    if numeric_cols.empty or target.isnull().all():
        return None
    model = RandomForestRegressor(random_state=42)
    model.fit(numeric_cols.fillna(0), target.fillna(0))
    importance = pd.Series(model.feature_importances_, index=numeric_cols.columns)
    return importance.sort_values(ascending=False)


def create_visualizations(df):
    """Generate visualizations from the dataset and save them as PNG files."""
    correlation_matrix = df.corr(numeric_only=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.savefig("correlation_heatmap.png")
    plt.close()

    for col in df.select_dtypes(include=["float64", "int64"]).columns:
        plt.figure()
        sns.histplot(df[col], kde=True)
        plt.title(f"Distribution of {col}")
        plt.savefig(f"{col}_distribution.png")
        plt.close()


def generate_readme(summary_stats, missing_values, llm_response, df):
"""Generate a README.md file with the analysis results.""" llm_response = llm_response or "No insights could be retrieved from the LLM due to an error." with open("README.md", "w") as f: f.write("# Automated Data Analysis Report\n\n") f.write("## Dataset Summary\n") f.write(summary_stats.to_markdown() + "\n\n") f.write("## Missing Values\n") f.write(missing_values.to_markdown() + "\n\n") f.write("## Insights\n") f.write(llm_response + "\n\n") f.write("## Visualizations\n") f.write("![Correlation Heatmap](correlation_heatmap.png)\n") for col in df.select_dtypes(include=['float64', 'int64']).columns: f.write(f"![Distribution of {col}]({col}_distribution.png)\n") def main(): if len(sys.argv) < 2: print("Usage: python autolysis.py ") return filename = sys.argv[1] if not os.path.isfile(filename): print(f"Error: File {filename} not found.") return try: # Load the dataset df = pd.read_csv(filename, encoding="ISO-8859-1") except Exception as e: print(f"Error reading {filename}: {e}") return if df is None or df.empty: print("The dataset is empty or could not be loaded.") return # Perform generic analysis summary_stats = df.describe(include='all') missing_values = df.isnull().sum() # Detect outliers outliers = detect_outliers(df) # Perform clustering clustering_result = perform_clustering(df) if clustering_result: clustered_df, cluster_centers = clustering_result else: print("Clustering could not be performed due to insufficient data.") # Query LLM for insights column_info = {col: str(dtype) for col, dtype in df.dtypes.items()} prompt = f""" Analyze this dataset with the following column information: {json.dumps(column_info, indent=2)} Summary statistics: {summary_stats.to_string()} Missing values: {missing_values.to_string()} Outliers detected: {outliers} """ llm_response = query_llm(prompt) # Create visualizations and generate README create_visualizations(df) generate_readme(summary_stats, missing_values, llm_response, df) print("Analysis complete. Results saved in README.md and visualization PNG files.") if __name__ == "__main__": main()