import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset and encode the diagnosis label as 1 ('M') / 0 ('B').
data = pd.read_csv('data.csv')
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
# Shuffle the rows so the positional split below is a random split.
data = data.sample(frac=1).reset_index(drop=True)

# Split into training (first 70% of the shuffled rows) and test sets.
# The label is selected by column name rather than by position, so the
# column encoded above is always the target and can never end up among the
# features, whatever its position in the CSV.
train_size = int(0.7 * len(data))
features = data.drop(columns='diagnosis').values
labels = data['diagnosis'].values
X_train = features[:train_size]
y_train = labels[:train_size]
X_test = features[train_size:]
y_test = labels[train_size:]

# Standardize features using statistics from the training rows only, and
# apply the same transform to the test rows.
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
# A feature that is constant over the training rows has zero standard
# deviation; dividing by 1 instead leaves it at 0 after centering rather
# than turning the whole column into NaN.
X_std = np.where(X_std == 0, 1.0, X_std)
X_train = (X_train - X_mean) / X_std
X_test = (X_test - X_mean) / X_std


def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    The exponential is only ever taken of a non-positive number, so large
    negative inputs cannot overflow (the naive form evaluates ``exp(-z)``,
    which overflows to ``inf`` and emits a RuntimeWarning).

    Args:
        z: A real scalar or array-like.

    Returns:
        Values in [0, 1] with the same shape as ``z``; a NumPy scalar when
        ``z`` is a scalar.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # exponent is <= 0, so this stays in [0, 1]
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal.
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwrap 0-d arrays so scalar input yields a scalar


def cost(theta, X, y):
    """Return the mean binary cross-entropy of a logistic model.

    With ``z = X.dot(theta)`` and ``h = 1 / (1 + exp(-z))`` the per-sample
    loss ``-[y*log(h) + (1 - y)*log(1 - h)]`` equals ``log(1 + exp(z)) - y*z``.
    Evaluating the latter form never takes ``log(0)``, so the result stays
    finite even when a sample is classified with full confidence (where the
    direct form produces ``nan`` from ``0 * -inf``).

    Args:
        theta: Parameter vector, one entry per column of ``X``.
        X: Design matrix, one row per sample.
        y: Labels (0 or 1), one per row of ``X``.

    Returns:
        The average loss over the ``len(y)`` samples.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    m = len(y)
    z = X.dot(theta)
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), computed
    # without overflow for large |z|.
    per_sample = np.logaddexp(0, z) - y * z
    return (1 / m) * np.sum(per_sample)


def gradient_descent(X, y, theta, learning_rate, iterations):
    """Fit logistic-regression parameters with batch gradient descent.

    Each iteration computes the gradient ``X.T.dot(sigmoid(X.dot(theta)) - y) / m``
    over all ``m = len(y)`` samples and moves ``theta`` against it by
    ``learning_rate``.

    Args:
        X: Design matrix, one row per sample.
        y: Labels, one per row of ``X``.
        theta: Initial parameter vector. It is not modified.
        learning_rate: Step size applied to the gradient.
        iterations: Number of update steps to run.

    Returns:
        A new float array holding the parameters after the last step.
    """
    m = len(y)
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array, and would fail with a casting error if
    # the caller passed an integer-typed initial vector.
    theta = np.array(theta, dtype=float)
    for _ in range(iterations):
        gradient = X.T.dot(sigmoid(X.dot(theta)) - y) / m
        theta -= learning_rate * gradient
    return theta


def predict_proba(X, theta):
    """Return the sigmoid of the linear score ``X.dot(theta)`` for each row.

    Args:
        X: Design matrix, one row per sample.
        theta: Parameter vector, one entry per column of ``X``.

    Returns:
        The model's probability output for every row of ``X``.
    """
    linear_score = X.dot(theta)
    return sigmoid(linear_score)


def predict(X, theta, threshold=0.5):
    """Return boolean predictions: True where the probability reaches ``threshold``.

    Args:
        X: Design matrix, one row per sample.
        theta: Parameter vector, one entry per column of ``X``.
        threshold: Minimum probability (inclusive) for a True prediction.

    Returns:
        The element-wise comparison ``predict_proba(X, theta) >= threshold``.
    """
    probabilities = predict_proba(X, theta)
    return probabilities >= threshold


# Prepend a column of ones to both design matrices so that theta[0] acts as
# the bias (intercept) term.
X_train, X_test = (
    np.concatenate((np.ones((len(split), 1)), split), axis=1)
    for split in (X_train, X_test)
)

# Start from an all-zero parameter vector and fit it with gradient descent
# (learning rate 0.01, 1000 iterations).
theta = np.zeros(X_train.shape[1])
theta = gradient_descent(
    X=X_train, y=y_train, theta=theta, learning_rate=0.01, iterations=1000
)

# Score the test rows: probabilities first, then hard predictions at the
# default threshold.
y_pred_prob = predict_proba(X_test, theta)
y_pred = predict(X_test, theta)


# Compute classification metrics by hand
def manual_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for binary predictions.

    Label 1 (or True) is treated as the positive class and 0 (or False) as
    the negative class. Any ratio whose denominator is zero is reported as 0
    instead of raising or producing NaN.

    Args:
        y_true: Ground-truth labels; any array-like of 0/1 or booleans.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    # Coerce to arrays first: comparing a plain Python list with ``== 1``
    # yields a single False rather than an element-wise mask, which would
    # silently make every count below zero.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0

    TP = int(np.sum(actual_pos & pred_pos))
    TN = int(np.sum(actual_neg & pred_neg))
    FP = int(np.sum(actual_neg & pred_pos))
    FN = int(np.sum(actual_pos & pred_neg))

    total = len(y_true)
    accuracy = (TP + TN) / total if total else 0.0
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return accuracy, precision, recall, f1


# Evaluate the thresholded test predictions and print one metric per line.
accuracy, precision, recall, f1 = manual_metrics(y_test, y_pred)
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)


# Plot the precision-recall (PR) curve
def plot_pr_curve(y_true, y_score):
    """Plot precision against recall, one point per distinct score threshold.

    For every distinct value ``t`` in ``y_score`` the samples with
    ``y_score >= t`` are treated as predicted positive, and precision and
    recall are computed against ``y_true`` (1 = positive, 0 = negative).
    A ratio with a zero denominator is plotted as 0.

    Args:
        y_true: Ground-truth labels; any array-like of 0/1.
        y_score: Predicted scores, same length as ``y_true``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Coerce to arrays: a plain list compared with ``== 1`` gives a single
    # False instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # These do not depend on the threshold, so compute them once.
    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = np.sum(is_positive)  # equals TP + FN at every threshold

    precisions = []
    recalls = []
    # np.unique returns the sorted distinct scores; repeated scores would
    # only reproduce the same point on the curve.
    for threshold in np.unique(y_score):
        flagged = y_score >= threshold
        TP = np.sum(is_positive & flagged)
        FP = np.sum(is_negative & flagged)
        precisions.append(TP / (TP + FP) if TP + FP else 0)
        recalls.append(TP / n_positive if n_positive else 0)

    plt.plot(recalls, precisions)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.show()


# Draw the PR curve from the test labels and the predicted probabilities.
plot_pr_curve(y_true=y_test, y_score=y_pred_prob)


# Plot the ROC curve
def plot_roc_curve(y_true, y_score):
    """Plot true-positive rate against false-positive rate over all thresholds.

    For every distinct value ``t`` in ``y_score`` the samples with
    ``y_score >= t`` are treated as predicted positive, and TPR / FPR are
    computed against ``y_true`` (1 = positive, 0 = negative). A rate with a
    zero denominator is plotted as 0. The point (0, 0), which corresponds to
    a threshold above every score, is added so the curve starts at the origin.

    Args:
        y_true: Ground-truth labels; any array-like of 0/1.
        y_score: Predicted scores, same length as ``y_true``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Coerce to arrays: a plain list compared with ``== 1`` gives a single
    # False instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    # These do not depend on the threshold, so compute them once.
    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = np.sum(is_positive)  # equals TP + FN at every threshold
    n_negative = np.sum(is_negative)  # equals FP + TN at every threshold

    tpr = []
    fpr = []
    # np.unique returns the sorted distinct scores; repeated scores would
    # only reproduce the same point on the curve.
    for threshold in np.unique(y_score):
        flagged = y_score >= threshold
        TP = np.sum(is_positive & flagged)
        FP = np.sum(is_negative & flagged)
        tpr.append(TP / n_positive if n_positive else 0)
        fpr.append(FP / n_negative if n_negative else 0)

    # Even the largest score is still flagged by its own threshold, so the
    # loop never produces the "nothing predicted positive" end of the curve.
    tpr.append(0)
    fpr.append(0)

    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()


# Draw the ROC curve from the test labels and the predicted probabilities.
plot_roc_curve(y_true=y_test, y_score=y_pred_prob)