class LinearRegression:
    """Multivariate linear regression fitted in closed form (least squares)."""

    def __init__(self):
        """Initialise an unfitted Linear Regression model."""
        # Weights of the features (everything in _theta except the first entry).
        self.coef_ = None
        # Bias term (first entry of _theta).
        self.intercept_ = None
        # Full parameter vector: intercept followed by the feature weights.
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Fit the model to X_train / y_train by solving the least-squares problem.

        Parameters
        ----------
        X_train : 2-D array, one row per sample and one column per feature.
        y_train : array with the same number of rows as X_train.

        Returns
        -------
        self, so calls can be chained.

        Raises
        ------
        AssertionError if X_train and y_train have different numbers of rows.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        # Prepend a column of ones so the intercept is learned as theta[0].
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])

        # Solve min ||X_b @ theta - y|| directly instead of forming
        # inv(X_b.T @ X_b): the result is the same for full-rank data, but this
        # stays stable for ill-conditioned inputs and still returns a
        # (minimum-norm) solution when features are collinear, where the
        # explicit inverse would raise numpy.linalg.LinAlgError.
        self._theta = np.linalg.lstsq(X_b, y_train, rcond=None)[0]

        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Return the predicted targets for the samples in X_predict.

        Parameters
        ----------
        X_predict : 2-D array with the same number of columns as the training data.

        Raises
        ------
        AssertionError if the model has not been fitted, or if the number of
        columns of X_predict differs from the number of fitted coefficients.
        """
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, x_test, y_test):
        """Return the mean squared error (MSE) of the model on x_test / y_test.

        Note that, despite the method name, this is an error measure: lower is
        better and 0 means a perfect fit.

        Parameters
        ----------
        x_test : 2-D array of samples, passed to predict().
        y_test : true targets, one entry (or row) per sample in x_test.

        Returns
        -------
        The squared differences between y_test and the predictions, averaged
        over the samples (axis 0).

        Raises
        ------
        AssertionError if y_test is empty.
        """
        y_pred = np.asarray(self.predict(x_test))
        y_true = np.asarray(y_test)
        assert len(y_true) > 0, "y_test must not be empty"

        # Pair the i-th target with the i-th prediction even when one of them
        # is 1-D and the other is a column: pad the lower-rank array with
        # trailing axes so the subtraction cannot broadcast to an (n, n) matrix.
        while y_true.ndim < y_pred.ndim:
            y_true = y_true[..., np.newaxis]
        while y_pred.ndim < y_true.ndim:
            y_pred = y_pred[..., np.newaxis]

        residuals = y_true - y_pred
        return np.mean(residuals ** 2, axis=0)

    def __repr__(self):
        return "LinearRegression()"


def train_test_split(X, y, train_size=None, seed=None):
    """Shuffle X and y together and split them into a train and a test part.

    Parameters
    ----------
    X : array of samples, indexed along its first axis.
    y : array of targets with the same number of rows as X.
    train_size : how many samples go into the training part.
        * int   -- used directly as the slice position in the shuffled index
                   (the first ``train_size`` shuffled samples are the train set).
        * float -- a fraction strictly between 0 and 1 of the samples.
        * None  -- 80% of the samples are used for training.
    seed : if not None, passed to ``np.random.seed`` before shuffling so the
        split is reproducible. ``seed=0`` is a valid seed.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    AssertionError if X and y have different numbers of rows, or if a float
    train_size is not strictly between 0 and 1.
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"

    # Compare against None explicitly: a plain truthiness test would silently
    # ignore seed=0.
    if seed is not None:
        np.random.seed(seed)

    n_samples = len(X)

    if train_size is None:
        # Slicing with None would put every sample into BOTH parts, so fall
        # back to an 80/20 split instead.
        train_size = int(round(n_samples * 0.8))
    elif isinstance(train_size, (float, np.floating)):
        assert 0.0 < train_size < 1.0, \
            "a float train_size must be strictly between 0 and 1"
        train_size = int(round(n_samples * train_size))

    shuffled_indexes = np.random.permutation(n_samples)

    train_indexes = shuffled_indexes[:train_size]
    test_indexes = shuffled_indexes[train_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
| \n", " | CRIM | \n", "ZN | \n", "INDUS | \n", "CHAS | \n", "NOX | \n", "RM | \n", "AGE | \n", "DIS | \n", "RAD | \n", "TAX | \n", "PTRATIO | \n", "B | \n", "LSTAT | \n", "MEDV | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0.00632 | \n", "18.0 | \n", "2.31 | \n", "0 | \n", "0.538 | \n", "6.575 | \n", "65.2 | \n", "4.0900 | \n", "1 | \n", "296.0 | \n", "15.3 | \n", "396.90 | \n", "4.98 | \n", "24.0 | \n", "
| 1 | \n", "0.02731 | \n", "0.0 | \n", "7.07 | \n", "0 | \n", "0.469 | \n", "6.421 | \n", "78.9 | \n", "4.9671 | \n", "2 | \n", "242.0 | \n", "17.8 | \n", "396.90 | \n", "9.14 | \n", "21.6 | \n", "
| 2 | \n", "0.02729 | \n", "0.0 | \n", "7.07 | \n", "0 | \n", "0.469 | \n", "7.185 | \n", "61.1 | \n", "4.9671 | \n", "2 | \n", "242.0 | \n", "17.8 | \n", "392.83 | \n", "4.03 | \n", "34.7 | \n", "
| 3 | \n", "0.03237 | \n", "0.0 | \n", "2.18 | \n", "0 | \n", "0.458 | \n", "6.998 | \n", "45.8 | \n", "6.0622 | \n", "3 | \n", "222.0 | \n", "18.7 | \n", "394.63 | \n", "2.94 | \n", "33.4 | \n", "
| 4 | \n", "0.06905 | \n", "0.0 | \n", "2.18 | \n", "0 | \n", "0.458 | \n", "7.147 | \n", "54.2 | \n", "6.0622 | \n", "3 | \n", "222.0 | \n", "18.7 | \n", "396.90 | \n", "5.33 | \n", "36.2 | \n", "