{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "wjTQIGaDx8K2" }, "source": [ "# 7章 線形回帰" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vmSop0bBx8K4" }, "outputs": [], "source": [ "# 必要ライブラリの宣言\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "# waring抑止\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "source": [ "# 「ボストン・データセット」はscikit-learnのライブラリでも取得できるが、\n", "# その場合、将来版で利用できなくなる予定のため、別Webサイトから取得する\n", "data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n", "raw_df = pd.read_csv(data_url, sep=\"\\s+\", \n", " skiprows=22, header=None)\n", "x_org = np.hstack([raw_df.values[::2, :], \n", " raw_df.values[1::2, :2]])\n", "yt = raw_df.values[1::2, 2]\n", "feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',\n", " 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B', 'LSTAT'])\n", "\n", "print('元データ', x_org.shape, yt.shape)\n", "print('項目名: ', feature_names)\n", "\n", "# データ絞り込み (項目 RMのみ)\n", "x_data = x_org[:,feature_names == 'RM']\n", "print('絞り込み後', x_data.shape)\n", "\n", "# ダミー変数を追加\n", "x = np.insert(x_data, 0, 1.0, axis=1)\n", "print('ダミー変数追加後', x.shape)" ], "metadata": { "id": "ogSptB2fy1aD" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nuhfFmMqx8K6" }, "outputs": [], "source": [ "# 入力データxの表示 (ダミー変数を含む)\n", "print(x.shape)\n", "print(x[:5,:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_S3RqgiZx8K6" }, "outputs": [], "source": [ "# 正解データ yの表示\n", "print(yt[:5])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vjeqBe21x8K6" }, "outputs": [], "source": [ "# 散布図の表示\n", "plt.scatter(x[:,1], yt, s=10, c='b')\n", "plt.xlabel('ROOM', fontsize=14)\n", "plt.ylabel('PRICE', fontsize=14)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yRwHSUstx8K6" }, "outputs": [], "source": [ "# 予測関数 (1, x)の値から予測値ypを計算する\n", "def pred(x, w):\n", " return(x @ w)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "SjlZZfWox8K7" }, "outputs": [], "source": [ "# 初期化処理\n", "\n", "# データ系列総数\n", "M = x.shape[0]\n", "\n", "# 入力データ次元数(ダミー変数を含む)\n", "D = x.shape[1]\n", "\n", "# 繰り返し回数\n", "iters = 50000\n", "\n", "# 学習率\n", "alpha = 0.01\n", "\n", "# 重みベクトルの初期値 (すべての値を1にする)\n", "w = np.ones(D)\n", "\n", "# 評価結果記録用 (損失関数値のみ記録)\n", "history = np.zeros((0,2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PmiVJGGNx8K7" }, "outputs": [], "source": [ "# 繰り返しループ\n", "for k in range(iters):\n", " \n", " # 予測値の計算 (7.8.1)\n", " yp = pred(x, w)\n", " \n", " # 誤差の計算 (7.8.2)\n", " yd = yp - yt\n", " \n", " # 勾配降下法の実装 (7.8.4)\n", " w = w - alpha * (x.T @ yd) / M\n", " \n", " # 学習曲線描画用データの計算、保存\n", " if ( k % 100 == 0):\n", " # 損失関数値の計算 (7.6.1)\n", " loss = np.mean(yd ** 2) / 2\n", " # 計算結果の記録\n", " history = np.vstack((history, np.array([k, loss])))\n", " # 画面表示\n", " print( \"iter = %d loss = %f\" % (k, loss)) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N0ZHKZ6ox8K7" }, "outputs": [], "source": [ "# 最終的な損失関数初期値、最終値\n", "print('損失関数初期値: %f' % history[0,1])\n", "print('損失関数最終値: %f' % history[-1,1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jgO6KaBHx8K8" }, "outputs": [], "source": [ "# 下記直線描画用の座標値計算\n", "xall = x[:,1].ravel()\n", "xl = np.array([[1, xall.min()],[1, xall.max()]])\n", "yl = pred(xl, w)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "S0G50TqVx8K8" }, "outputs": [], "source": [ "# 散布図と回帰直線の描画\n", "plt.figure(figsize=(6,6))\n", "plt.scatter(x[:,1], yt, s=10, c='b')\n", "plt.xlabel('ROOM', fontsize=14)\n", "plt.ylabel('PRICE', fontsize=14)\n", "plt.plot(xl[:,1], yl, c='k')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JDl7UpOwx8K8" }, "outputs": [], "source": [ "# 学習曲線の表示 (最初の1個分を除く)\n", "plt.plot(history[1:,0], history[1:,1])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "m8ZNNOdgx8K8" }, "source": [ "## 7.10 重回帰モデルへの拡張" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qerEhaTyx8K8" }, "outputs": [], "source": [ "# 列(LSTAT: 低所得者率)の追加\n", "x_add = x_org[:,feature_names == 'LSTAT']\n", "x2 = np.hstack((x, x_add))\n", "print(x2.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "PycbKxvUx8K9" }, "outputs": [], "source": [ "# 入力データxの表示 (ダミーデータを含む)\n", "print(x2[:5,:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Klw7-ngkx8K9" }, "outputs": [], "source": [ "# 初期化処理\n", "\n", "# データ系列総数\n", "M = x2.shape[0]\n", "\n", "# 入力データ次元数(ダミー変数を含む)\n", "D = x2.shape[1]\n", "\n", "# 繰り返し回数\n", "iters = 50000\n", "\n", "# 学習率\n", "alpha = 0.01\n", "\n", "# 重みベクトルの初期値 (すべての値を1にする)\n", "w = np.ones(D)\n", "\n", "# 評価結果記録用 (損失関数値のみ記録)\n", "history = np.zeros((0,2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vT23CBQtx8K9" }, "outputs": [], "source": [ "# 繰り返しループ\n", "for k in range(iters):\n", " \n", " # 予測値の計算 (7.8.1)\n", " yp = pred(x2, w)\n", " \n", " # 誤差の計算 (7.8.2)\n", " yd = yp - yt\n", " \n", " # 勾配降下法の実装 (7.8.4)\n", " w = w - alpha * (x2.T @ yd) / M\n", " \n", " # 学習曲線描画用データの計算、保存\n", " if ( k % 100 == 0):\n", " # 損失関数値の計算 (7.6.1)\n", " loss = np.mean(yd ** 2) / 2\n", " # 計算結果の記録\n", " history = np.vstack((history, np.array([k, loss])))\n", " # 画面表示\n", " print( \"iter = %d loss = %f\" % (k, loss)) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jk6RHhEWx8K9" }, "outputs": [], "source": [ "# 初期化処理 (パラメータを適切な値に変更)\n", "\n", "# データ系列総数\n", "M = x2.shape[0]\n", "\n", "# 入力データ次元数(ダミー変数を含む)\n", "D = x2.shape[1]\n", "\n", "# 繰り返し回数\n", "#iters = 50000\n", "iters = 2000\n", "\n", "# 学習率\n", "#alpha = 0.01\n", "alpha = 0.001\n", "\n", "# 重みベクトルの初期値 (すべての値を1にする)\n", "w = np.ones(D)\n", "\n", "# 評価結果記録用 (損失関数値のみ記録)\n", "history = np.zeros((0,2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kOwfmpMdx8K9" }, "outputs": [], "source": [ "# 繰り返しループ\n", "for k in range(iters):\n", " \n", " # 予測値の計算 (7.8.1)\n", " yp = pred(x2, w)\n", " \n", " # 誤差の計算 (7.8.2)\n", " yd = yp - yt\n", " \n", " # 勾配降下法の実装 (7.8.4)\n", " w = w - alpha * (x2.T @ yd) / M\n", " \n", " # 学習曲線描画用データの計算、保存\n", " if ( k % 100 == 0):\n", " # 損失関数値の計算 (7.6.1)\n", " loss = np.mean(yd ** 2) / 2\n", " # 計算結果の記録\n", " history = np.vstack((history, np.array([k, loss])))\n", " # 画面表示\n", " print( \"iter = %d loss = %f\" % (k, loss)) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true, "id": "6-ro-cXgx8K9" }, "outputs": [], "source": [ "# 最終的な損失関数初期値、最終値\n", "print('損失関数初期値: %f' % history[0,1])\n", "print('損失関数最終値: %f' % history[-1,1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "tGhwtj-tx8K9" }, "outputs": [], "source": [ "# 学習曲線の表示 (最初の10個分を除く)\n", "plt.plot(history[:,0], history[:,1])\n", "plt.show()" ] }, { "cell_type": "code", "source": [ "" ], "metadata": { "id": "BfxbFovNzOHD" }, "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "colab": { "name": "ch07-regression.ipynb", "provenance": [], "collapsed_sections": [] } }, "nbformat": 4, "nbformat_minor": 0 }