{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "XjTc5n2flYUu" }, "source": [ "# 특성 공학과 규제" ] }, { "cell_type": "markdown", "metadata": { "id": "B8YOr2hElYUv" }, "source": [ "\n", " \n", "
\n", " 구글 코랩에서 실행하기\n", "
" ] }, { "cell_type": "markdown", "metadata": { "id": "fZwhQU2l8tI6" }, "source": [ "## 데이터 준비" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "id": "3kjaTfOqEVwY" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8qmTS1RzKRKT", "outputId": "62a1790e-34bf-4a28-f135-e66469a5f25b" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[ 8.4 2.11 1.41]\n", " [13.7 3.53 2. ]\n", " [15. 3.82 2.43]\n", " [16.2 4.59 2.63]\n", " [17.4 4.59 2.94]\n", " [18. 5.22 3.32]\n", " [18.7 5.2 3.12]\n", " [19. 5.64 3.05]\n", " [19.6 5.14 3.04]\n", " [20. 5.08 2.77]\n", " [21. 5.69 3.56]\n", " [21. 5.92 3.31]\n", " [21. 5.69 3.67]\n", " [21.3 6.38 3.53]\n", " [22. 6.11 3.41]\n", " [22. 5.64 3.52]\n", " [22. 6.11 3.52]\n", " [22. 5.88 3.52]\n", " [22. 5.52 4. ]\n", " [22.5 5.86 3.62]\n", " [22.5 6.79 3.62]\n", " [22.7 5.95 3.63]\n", " [23. 5.22 3.63]\n", " [23.5 6.28 3.72]\n", " [24. 7.29 3.72]\n", " [24. 6.38 3.82]\n", " [24.6 6.73 4.17]\n", " [25. 6.44 3.68]\n", " [25.6 6.56 4.24]\n", " [26.5 7.17 4.14]\n", " [27.3 8.32 5.14]\n", " [27.5 7.17 4.34]\n", " [27.5 7.05 4.34]\n", " [27.5 7.28 4.57]\n", " [28. 7.82 4.2 ]\n", " [28.7 7.59 4.64]\n", " [30. 7.62 4.77]\n", " [32.8 10.03 6.02]\n", " [34.5 10.26 6.39]\n", " [35. 11.49 7.8 ]\n", " [36.5 10.88 6.86]\n", " [36. 10.61 6.74]\n", " [37. 10.84 6.26]\n", " [37. 10.57 6.37]\n", " [39. 11.14 7.49]\n", " [39. 11.14 6. ]\n", " [39. 12.43 7.35]\n", " [40. 11.93 7.11]\n", " [40. 11.73 7.22]\n", " [40. 12.38 7.46]\n", " [40. 11.14 6.63]\n", " [42. 12.8 6.87]\n", " [43. 11.93 7.28]\n", " [43. 12.51 7.42]\n", " [43.5 12.6 8.14]\n", " [44. 
12.49 7.6 ]]\n" ] } ], "source": [ "# Fetch the perch feature table and convert it to a 2-D NumPy array;\n", "# the printed result shows three numeric columns per row.\n", "df = pd.read_csv('https://bit.ly/perch_csv_data')\n", "perch_full = df.to_numpy()\n", "print(perch_full)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "id": "PsRC7rvE9SbL" }, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Regression targets; train_test_split pairs them row-by-row with perch_full.\n", "perch_weight = np.array(\n", " [5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,\n", " 110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,\n", " 130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,\n", " 197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,\n", " 514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,\n", " 820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,\n", " 1000.0, 1000.0]\n", " )" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "id": "cRKkoWoZ9J0m" }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# A fixed random_state keeps the train/test partition the same on every run.\n", "train_input, test_input, train_target, test_target = train_test_split(\n", "    perch_full, perch_weight, random_state=42\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "y5uMFE_8V1tx" }, "source": [ "## 사이킷런의 변환기" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "id": "EclugdXmSs-L" }, "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "V5is7cZhKbPU", "outputId": "a27d2b55-7547-4add-b3e9-3a97c23eb954" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[1. 2. 3. 4. 6. 9.]]\n" ] } ], "source": [ "# Default settings: the sample [2, 3] expands to the six values printed below\n", "# (a constant 1, the two inputs, and their degree-2 products).\n", "poly = PolynomialFeatures()\n", "print(poly.fit_transform([[2, 3]]))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bKXkK0oJc4nG", "outputId": "ee304884-059a-4e6a-c358-64530cb1129d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[2. 3. 4. 6. 
9.]]\n" ] } ], "source": [ "# include_bias=False drops the constant 1 column from the expansion\n", "poly = PolynomialFeatures(include_bias=False)\n", "print(poly.fit_transform([[2, 3]]))" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "id": "__kE6eJdNZfm" }, "outputs": [], "source": [ "# Learn the feature combinations from the training set and expand it in one step\n", "poly = PolynomialFeatures(include_bias=False)\n", "train_poly = poly.fit_transform(train_input)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2a_lmkKle4kF", "outputId": "15ad4a84-a8e1-4079-8cc6-6bb9ef128e2f" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(42, 9)\n" ] } ], "source": [ "print(train_poly.shape)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "X6GUwfXTfKbl", "outputId": "a47e9c30-9730-497c-e897-8ff248cc9074" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2',\n", " 'x2^2'], dtype=object)" ] }, "metadata": {}, "execution_count": 40 } ], "source": [ "poly.get_feature_names_out()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "id": "DJMPxe2mgbOo" }, "outputs": [], "source": [ "# Expand the test set with the transformer that was fitted on the training set\n", "test_poly = poly.transform(test_input)" ] }, { "cell_type": "markdown", "metadata": { "id": "PdDAslHzNk3H" }, "source": [ "## 다중 회귀 모델 훈련하기" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "E9ygy-8WOvIP", "outputId": "73ec508e-d9f9-4147-bad6-21109aafd93d" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9903183436982125\n" ] } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", "# Fit a linear model on the expanded features and print its training score\n", "lr = LinearRegression().fit(train_poly, train_target)\n", "print(lr.score(train_poly, train_target))" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "GKKyfFcAd7zm", "outputId": "80944369-a45b-4e14-c38e-0652efe39723" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9714559911594111\n" ] } ], "source": [ "# Score of the same model on the held-out test set\n", "print(lr.score(test_poly, test_target))" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "id": "2fDt5mrReMwU" }, "outputs": [], "source": [ "# Repeat the expansion with terms up to degree 5; this rebinds poly,\n", "# train_poly and test_poly, which the cells below rely on\n", "poly = PolynomialFeatures(degree=5, include_bias=False)\n", "train_poly = poly.fit_transform(train_input)\n", "test_poly = poly.transform(test_input)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hcM8R4VHSzR8", "outputId": "ab384f3c-04d9-41b3-b5c7-f6ac6de6df9e" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(42, 55)\n" ] } ], "source": [ "print(train_poly.shape)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UffVFVTGP8xj", "outputId": "c76714a5-18eb-46a1-d29f-6f26c5a83a19" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9999999999996433\n" ] } ], "source": [ "# Refit the same estimator on the degree-5 features (more columns than training rows)\n", "lr.fit(train_poly, train_target)\n", "print(lr.score(train_poly, train_target))" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GtITdlYFg7AY", "outputId": "c2c167cf-e680-44a5-a7ff-71efe48b9a03" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "-144.40579436844948\n" ] } ], "source": [ "# The test score printed here is strongly negative despite the near-perfect training score\n", "print(lr.score(test_poly, test_target))" ] }, { "cell_type": "markdown", "metadata": { "id": "K2YMPSelQBpO" }, "source": [ "## 규제" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "id": "hCC7wKy3QQrE" }, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", "\n", "# Learn the scaling statistics from the training features only,\n", "# then apply that same scaling to the test features\n", "ss = StandardScaler()\n", "train_scaled = ss.fit_transform(train_poly)\n", "test_scaled = 
ss.transform(test_poly)" ] }, { "cell_type": "markdown", "metadata": { "id": "qyLI7JQsJ7RQ" }, "source": [ "## 릿지" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LdNuDNQGQipv", "outputId": "11558a6c-0c97-4b1f-f345-933018b18221" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9896101671037343\n" ] } ], "source": [ "from sklearn.linear_model import Ridge\n", "\n", "# Ridge regression with its default alpha, fitted on the standardized features\n", "ridge = Ridge()\n", "ridge.fit(train_scaled, train_target)\n", "print(ridge.score(train_scaled, train_target))" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "p5mXLecwhdnF", "outputId": "0e2ddf45-68a9-4f8b-8ab2-e4b9f5360087" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9790693977615387\n" ] } ], "source": [ "print(ridge.score(test_scaled, test_target))" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "id": "wXd3_Kq6hlbM" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "id": "9MvIvQOrhfqC" }, "outputs": [], "source": [ "# Reset the score lists in the same cell that fills them, so re-running this\n", "# cell never appends a second set of scores (which would make the plot cell\n", "# fail on mismatched x/y lengths).\n", "train_score = []\n", "test_score = []\n", "\n", "alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]\n", "for alpha in alpha_list:\n", "    # Build a ridge model with this regularization strength\n", "    ridge = Ridge(alpha=alpha)\n", "    # Fit it on the standardized training features\n", "    ridge.fit(train_scaled, train_target)\n", "    # Record the training and test scores\n", "    train_score.append(ridge.score(train_scaled, train_target))\n", "    test_score.append(ridge.score(test_scaled, test_target))" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "95DjrJxlhiow", "outputId": "a004ea2c-a338-448d-9dbd-2b68aff09892" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "# The x values are log10(alpha), so the axis label says so\n", "plt.plot(np.log10(alpha_list), train_score)\n", "plt.plot(np.log10(alpha_list), test_score)\n", "plt.xlabel('log10(alpha)')\n", "plt.ylabel('R^2')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5S5vhi-vhjzT", "outputId": "ac52eb28-1bf2-434f-d21f-b123a6d1f0cd" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9903815817570367\n", "0.9827976465386928\n" ] } ], "source": [ "# Refit with alpha=0.1 (presumably read off the curve above) and print train/test scores\n", "ridge = Ridge(alpha=0.1)\n", "ridge.fit(train_scaled, train_target)\n", "\n", "print(ridge.score(train_scaled, train_target))\n", "print(ridge.score(test_scaled, test_target))" ] }, { "cell_type": "markdown", "metadata": { "id": "jUph9pH_KA9_" }, "source": [ "## 라쏘" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ymu-jmekh0IK", "outputId": "dab813ee-5531-4905-93b2-52d16c8e8b21" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.989789897208096\n" ] } ], "source": [ "from sklearn.linear_model import Lasso\n", "\n", "# Lasso regression with its default alpha, fitted on the standardized features\n", "lasso = Lasso()\n", "lasso.fit(train_scaled, train_target)\n", "print(lasso.score(train_scaled, train_target))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t3jO37UMh2iI", "outputId": "232674e2-4d08-4ae2-bba0-040a72a53e44" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9800593698421883\n" ] } ], "source": [ "print(lasso.score(test_scaled, test_target))" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uoL2oJ6Ih4Jw", "outputId": "30dffbb7-9b86-42e5-fa32-5953a4b8efa3", "scrolled": true }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: 
ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.878e+04, tolerance: 5.183e+02\n", " model = cd_fast.enet_coordinate_descent(\n", "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_coordinate_descent.py:631: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.297e+04, tolerance: 5.183e+02\n", " model = cd_fast.enet_coordinate_descent(\n" ] } ], "source": [ "# Score lists are reset here so this cell can be re-run on its own\n", "train_score = []\n", "test_score = []\n", "\n", "alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]\n", "for alpha in alpha_list:\n", "    # Build a lasso model with this regularization strength; even with\n", "    # max_iter=10000 the recorded run emitted two ConvergenceWarnings\n", "    lasso = Lasso(alpha=alpha, max_iter=10000)\n", "    # Fit it on the standardized training features\n", "    lasso.fit(train_scaled, train_target)\n", "    # Record the training and test scores\n", "    train_score.append(lasso.score(train_scaled, train_target))\n", "    test_score.append(lasso.score(test_scaled, test_target))" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 449 }, "id": "7rkH8Dvzh9UI", "outputId": "a1b647d7-25d5-4206-edc8-1bf07a279d9d" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "# The x values are log10(alpha), so the axis label says so\n", "plt.plot(np.log10(alpha_list), train_score)\n", "plt.plot(np.log10(alpha_list), test_score)\n", "plt.xlabel('log10(alpha)')\n", "plt.ylabel('R^2')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t4uFD9Flh_Dw", "outputId": "6946c4fd-97af-44b4-b99a-facb65d14661" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0.9888067471131867\n", "0.9824470598706695\n" ] } ], "source": [ "# Refit with alpha=10 (presumably read off the curve above) and print train/test scores\n", "lasso = Lasso(alpha=10)\n", "lasso.fit(train_scaled, train_target)\n", "\n", "print(lasso.score(train_scaled, train_target))\n", "print(lasso.score(test_scaled, test_target))" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z_bQc3s8Uoai", "outputId": "5ad2a313-3880-4078-ac63-4019049b5ec4" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "40\n" ] } ], "source": [ "# Count the coefficients that the fitted lasso model set exactly to zero\n", "print(np.sum(lasso.coef_ == 0))" ] } ], "metadata": { "colab": { "name": "3-3 특성 공학과 규제.ipynb", "provenance": [] }, "kernelspec": { "display_name": "default:Python", "language": "python", "name": "conda-env-default-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.10" } }, "nbformat": 4, "nbformat_minor": 0 }