"""Exploratory analysis of ``goodreads.csv`` plus an LLM request for ideas.

Steps, in order: load the CSV, print an overview and the missing-value
counts, drop incomplete rows, print and plot the numeric correlation matrix,
assign KMeans cluster labels, count z-score outliers, and send a summary of
the cleaned data to ``gpt-4o-mini`` asking for further analyses.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns  # type: ignore
from openai import OpenAI
from sklearn.cluster import KMeans

# Number of KMeans clusters and the |z| cut-off used to flag outlier rows.
N_CLUSTERS = 3
Z_SCORE_THRESHOLD = 3

# Read the token with os.environ[...] so a missing variable raises KeyError
# *before* the client is built, instead of passing api_key=None to OpenAI().
client = OpenAI(api_key=os.environ["AIRPROXY_TOKEN"])

# Load the CSV file
df_goodread = pd.read_csv("goodreads.csv")
pd.set_option("display.max_columns", None)

# NOTE(review): "Overviewe" is misspelled in this runtime message; the text
# is left unchanged here in case anything matches on the script's output.
print("Overviewe of the data")
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_goodread.info()

print("\nSummary Statistics:")
print(df_goodread.describe(include="all"))

missing_values = df_goodread.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Drop every row that has at least one missing value.
df_goodread.dropna(inplace=True)
print("Missing values dropped.")
df_goodread.info()

# Recomputed on the cleaned frame; this post-drop count is also the one that
# ends up in data_summary below.
missing_values = df_goodread.isnull().sum()
print("\nMissing Values after removal:")
print(missing_values)

# Snapshot of the numeric columns, taken before the "Cluster" column exists.
# It is reused for the correlation matrix, the clustering input and the
# z-scores, so the cluster label never leaks into those computations.
num_cols = df_goodread.select_dtypes(include=[np.number])

correlation_matrix = num_cols.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Correlation Heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix Heatmap")
plt.axis("equal")
plt.show()

# Clusters
# KMeans cannot fit more clusters than there are rows, so clustering is
# skipped when fewer than N_CLUSTERS rows survived dropna().
if not num_cols.empty and len(num_cols) >= N_CLUSTERS:
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
    df_goodread["Cluster"] = kmeans.fit_predict(num_cols)
    print("\nCluster Labels Assigned.")

# Flag rows where any numeric feature lies more than Z_SCORE_THRESHOLD
# standard deviations from its column mean. Constant columns produce NaN
# z-scores, which compare False and therefore never flag a row.
z_scores = np.abs((num_cols - num_cols.mean()) / num_cols.std())
outliers = (z_scores > Z_SCORE_THRESHOLD).any(axis=1)
print(f"\nOutlier Count: {outliers.sum()}")

# Compact description of the cleaned frame, embedded in the prompt below.
data_summary = {
    "column_names": df_goodread.columns.tolist(),
    "column_types": df_goodread.dtypes.astype(str).tolist(),
    "missing_values": missing_values.to_dict(),
    "sample_data": df_goodread.head(3).to_dict(),
    "summary_statistics": df_goodread.describe(include="all").to_dict(),
}

# LLM - GPT-4o-mini
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a data analysis assistant"},
        {
            "role": "user",
            "content": (
                f"Analyze the following dataset summary: {data_summary} "
                "Suggest specific analyses to gain more insights."
            ),
        },
    ],
)