{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wjTQIGaDx8K2"
      },
      "source": [
        "# 7章　線形回帰"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vmSop0bBx8K4"
      },
      "outputs": [],
      "source": [
        "# 必要ライブラリの宣言\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# waring抑止\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# 「ボストン・データセット」はscikit-learnのライブラリでも取得できるが、\n",
        "# その場合、将来版で利用できなくなる予定のため、別Webサイトから取得する\n",
        "data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n",
        "raw_df = pd.read_csv(data_url, sep=\"\\s+\", \n",
        "    skiprows=22, header=None)\n",
        "x_org = np.hstack([raw_df.values[::2, :], \n",
        "    raw_df.values[1::2, :2]])\n",
        "yt = raw_df.values[1::2, 2]\n",
        "feature_names = np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX',\n",
        "    'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO','B', 'LSTAT'])\n",
        "\n",
        "print('元データ', x_org.shape, yt.shape)\n",
        "print('項目名: ', feature_names)\n",
        "\n",
        "# データ絞り込み (項目 RMのみ)\n",
        "x_data = x_org[:,feature_names == 'RM']\n",
        "print('絞り込み後', x_data.shape)\n",
        "\n",
        "# ダミー変数を追加\n",
        "x = np.insert(x_data, 0, 1.0, axis=1)\n",
        "print('ダミー変数追加後', x.shape)"
      ],
      "metadata": {
        "id": "ogSptB2fy1aD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nuhfFmMqx8K6"
      },
      "outputs": [],
      "source": [
        "# 入力データxの表示 (ダミー変数を含む)\n",
        "print(x.shape)\n",
        "print(x[:5,:])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_S3RqgiZx8K6"
      },
      "outputs": [],
      "source": [
        "# 正解データ yの表示\n",
        "print(yt[:5])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vjeqBe21x8K6"
      },
      "outputs": [],
      "source": [
        "# 散布図の表示\n",
        "plt.scatter(x[:,1], yt, s=10, c='b')\n",
        "plt.xlabel('ROOM', fontsize=14)\n",
        "plt.ylabel('PRICE', fontsize=14)\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yRwHSUstx8K6"
      },
      "outputs": [],
      "source": [
        "# 予測関数 (1, x)の値から予測値ypを計算する\n",
        "def pred(x, w):\n",
        "    return(x @ w)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SjlZZfWox8K7"
      },
      "outputs": [],
      "source": [
        "# 初期化処理\n",
        "\n",
        "# データ系列総数\n",
        "M  = x.shape[0]\n",
        "\n",
        "# 入力データ次元数(ダミー変数を含む)\n",
        "D = x.shape[1]\n",
        "\n",
        "# 繰り返し回数\n",
        "iters = 50000\n",
        "\n",
        "# 学習率\n",
        "alpha = 0.01\n",
        "\n",
        "# 重みベクトルの初期値 (すべての値を1にする)\n",
        "w = np.ones(D)\n",
        "\n",
        "# 評価結果記録用 (損失関数値のみ記録)\n",
        "history = np.zeros((0,2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PmiVJGGNx8K7"
      },
      "outputs": [],
      "source": [
        "# 繰り返しループ\n",
        "for k in range(iters):\n",
        "    \n",
        "    # 予測値の計算 (7.8.1)\n",
        "    yp = pred(x, w)\n",
        "    \n",
        "    # 誤差の計算 (7.8.2)\n",
        "    yd = yp - yt\n",
        "    \n",
        "    # 勾配降下法の実装 (7.8.4)\n",
        "    w = w - alpha * (x.T @ yd) / M\n",
        "    \n",
        "    # 学習曲線描画用データの計算、保存\n",
        "    if ( k % 100 == 0):\n",
        "        # 損失関数値の計算 (7.6.1)\n",
        "        loss = np.mean(yd ** 2) / 2\n",
        "        # 計算結果の記録\n",
        "        history = np.vstack((history, np.array([k, loss])))\n",
        "        # 画面表示\n",
        "        print( \"iter = %d  loss = %f\" % (k, loss))    "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N0ZHKZ6ox8K7"
      },
      "outputs": [],
      "source": [
        "# 最終的な損失関数初期値、最終値\n",
        "print('損失関数初期値: %f' % history[0,1])\n",
        "print('損失関数最終値: %f' % history[-1,1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "jgO6KaBHx8K8"
      },
      "outputs": [],
      "source": [
        "# 下記直線描画用の座標値計算\n",
        "xall = x[:,1].ravel()\n",
        "xl = np.array([[1, xall.min()],[1, xall.max()]])\n",
        "yl = pred(xl, w)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "S0G50TqVx8K8"
      },
      "outputs": [],
      "source": [
        "# 散布図と回帰直線の描画\n",
        "plt.figure(figsize=(6,6))\n",
        "plt.scatter(x[:,1], yt, s=10, c='b')\n",
        "plt.xlabel('ROOM', fontsize=14)\n",
        "plt.ylabel('PRICE', fontsize=14)\n",
        "plt.plot(xl[:,1], yl, c='k')\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JDl7UpOwx8K8"
      },
      "outputs": [],
      "source": [
        "# 学習曲線の表示 (最初の1個分を除く)\n",
        "plt.plot(history[1:,0], history[1:,1])\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "m8ZNNOdgx8K8"
      },
      "source": [
        "## 7.10  重回帰モデルへの拡張"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qerEhaTyx8K8"
      },
      "outputs": [],
      "source": [
        "# 列(LSTAT: 低所得者率)の追加\n",
        "x_add = x_org[:,feature_names == 'LSTAT']\n",
        "x2 = np.hstack((x, x_add))\n",
        "print(x2.shape)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PycbKxvUx8K9"
      },
      "outputs": [],
      "source": [
        "# 入力データxの表示 (ダミーデータを含む)\n",
        "print(x2[:5,:])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Klw7-ngkx8K9"
      },
      "outputs": [],
      "source": [
        "# 初期化処理\n",
        "\n",
        "# データ系列総数\n",
        "M  = x2.shape[0]\n",
        "\n",
        "# 入力データ次元数(ダミー変数を含む)\n",
        "D = x2.shape[1]\n",
        "\n",
        "# 繰り返し回数\n",
        "iters = 50000\n",
        "\n",
        "# 学習率\n",
        "alpha = 0.01\n",
        "\n",
        "# 重みベクトルの初期値 (すべての値を1にする)\n",
        "w = np.ones(D)\n",
        "\n",
        "# 評価結果記録用 (損失関数値のみ記録)\n",
        "history = np.zeros((0,2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "vT23CBQtx8K9"
      },
      "outputs": [],
      "source": [
        "# 繰り返しループ\n",
        "for k in range(iters):\n",
        "    \n",
        "    # 予測値の計算 (7.8.1)\n",
        "    yp = pred(x2, w)\n",
        "    \n",
        "    # 誤差の計算 (7.8.2)\n",
        "    yd = yp - yt\n",
        "    \n",
        "    # 勾配降下法の実装 (7.8.4)\n",
        "    w = w - alpha * (x2.T @ yd) / M\n",
        "    \n",
        "    # 学習曲線描画用データの計算、保存\n",
        "    if ( k % 100 == 0):\n",
        "        # 損失関数値の計算 (7.6.1)\n",
        "        loss = np.mean(yd ** 2) / 2\n",
        "        # 計算結果の記録\n",
        "        history = np.vstack((history, np.array([k, loss])))\n",
        "        # 画面表示\n",
        "        print( \"iter = %d  loss = %f\" % (k, loss))    "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "jk6RHhEWx8K9"
      },
      "outputs": [],
      "source": [
        "# 初期化処理 (パラメータを適切な値に変更)\n",
        "\n",
        "# データ系列総数\n",
        "M  = x2.shape[0]\n",
        "\n",
        "# 入力データ次元数(ダミー変数を含む)\n",
        "D = x2.shape[1]\n",
        "\n",
        "# 繰り返し回数\n",
        "#iters = 50000\n",
        "iters = 2000\n",
        "\n",
        "# 学習率\n",
        "#alpha = 0.01\n",
        "alpha = 0.001\n",
        "\n",
        "# 重みベクトルの初期値 (すべての値を1にする)\n",
        "w = np.ones(D)\n",
        "\n",
        "# 評価結果記録用 (損失関数値のみ記録)\n",
        "history = np.zeros((0,2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kOwfmpMdx8K9"
      },
      "outputs": [],
      "source": [
        "# 繰り返しループ\n",
        "for k in range(iters):\n",
        "    \n",
        "    # 予測値の計算 (7.8.1)\n",
        "    yp = pred(x2, w)\n",
        "    \n",
        "    # 誤差の計算 (7.8.2)\n",
        "    yd = yp - yt\n",
        "    \n",
        "    # 勾配降下法の実装 (7.8.4)\n",
        "    w = w - alpha * (x2.T @ yd) / M\n",
        "    \n",
        "    # 学習曲線描画用データの計算、保存\n",
        "    if ( k % 100 == 0):\n",
        "        # 損失関数値の計算 (7.6.1)\n",
        "        loss = np.mean(yd ** 2) / 2\n",
        "        # 計算結果の記録\n",
        "        history = np.vstack((history, np.array([k, loss])))\n",
        "        # 画面表示\n",
        "        print( \"iter = %d  loss = %f\" % (k, loss))    "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "6-ro-cXgx8K9"
      },
      "outputs": [],
      "source": [
        "# 最終的な損失関数初期値、最終値\n",
        "print('損失関数初期値: %f' % history[0,1])\n",
        "print('損失関数最終値: %f' % history[-1,1])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tGhwtj-tx8K9"
      },
      "outputs": [],
      "source": [
        "# 学習曲線の表示 (最初の10個分を除く)\n",
        "plt.plot(history[:,0], history[:,1])\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        ""
      ],
      "metadata": {
        "id": "BfxbFovNzOHD"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.7"
    },
    "colab": {
      "name": "ch07-regression.ipynb",
      "provenance": [],
      "collapsed_sections": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}