{"cells":[{"cell_type":"markdown","source":["# 로지스틱 모델 이용전 전처리"],"metadata":{"id":"Q6E1rRRAkfYY"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2593,"status":"ok","timestamp":1654840588592,"user":{"displayName":"정태영","userId":"14526705904009737659"},"user_tz":-540},"id":"5y_gTeAHCz1c","outputId":"d4f01162-072e-44b3-83aa-c67028709bb0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting category_encoders\n"," Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)\n","\u001b[K |████████████████████████████████| 69 kB 3.7 MB/s \n","\u001b[?25hRequirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.4.1)\n","Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.0.2)\n","Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.21.6)\n","Requirement already satisfied: statsmodels>=0.9.0 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (0.10.2)\n","Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (0.5.2)\n","Requirement already satisfied: pandas>=1.0.5 in /usr/local/lib/python3.7/dist-packages (from category_encoders) (1.3.5)\n","Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0.5->category_encoders) (2.8.2)\n","Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0.5->category_encoders) (2022.1)\n","Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from patsy>=0.5.1->category_encoders) (1.15.0)\n","Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.0->category_encoders) (3.1.0)\n","Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.20.0->category_encoders) (1.1.0)\n","Installing collected packages: category-encoders\n","Successfully installed category-encoders-2.5.0\n"]}],"source":["!pip install category_encoders"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TX5AuZMqDJsY","executionInfo":{"status":"ok","timestamp":1654840627829,"user_tz":-540,"elapsed":22397,"user":{"displayName":"정태영","userId":"14526705904009737659"}},"outputId":"c7898d1a-fbc3-4b57-d128-b3b8eda593d7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","execution_count":3,"metadata":{"id":"YIkcbsJ6DLBA","executionInfo":{"status":"ok","timestamp":1654840630737,"user_tz":-540,"elapsed":851,"user":{"displayName":"정태영","userId":"14526705904009737659"}}},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","#method : scaling, log transformation, cube Root normalization, Box-Cox transformation\n","from scipy.stats import skew \n","from scipy.stats import boxcox\n","\n","%matplotlib inline\n","\n","import warnings\n","warnings.filterwarnings('ignore')"]},{"cell_type":"code","execution_count":123,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":832,"status":"ok","timestamp":1654852037198,"user":{"displayName":"정태영","userId":"14526705904009737659"},"user_tz":-540},"id":"Spn-aO70DTh-","outputId":"8eed398c-8d08-4dd7-fe23-1196170a2424"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["(319795, 18)"]},"metadata":{},"execution_count":123}],"source":["df = pd.read_csv('/content/drive/MyDrive/기계학습 팀플/heart_2020_cleaned.csv')\n","df.shape"]},{"cell_type":"code","execution_count":124,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":309,"status":"ok","timestamp":1654852038691,"user":{"displayName":"정태영","userId":"14526705904009737659"},"user_tz":-540},"id":"5qmHgm9_DUPc","outputId":"4a6256f6-d3eb-410c-e8f5-b242b381c98a"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["(301717, 18)"]},"metadata":{},"execution_count":124}],"source":["df = df.drop_duplicates()\n","df.shape"]},{"cell_type":"code","execution_count":125,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":367,"status":"ok","timestamp":1654852040453,"user":{"displayName":"정태영","userId":"14526705904009737659"},"user_tz":-540},"id":"GTwaL2iFDVwz","outputId":"c241e591-8938-4de1-836a-48d61af3329a"},"outputs":[{"output_type":"execute_result","data":{"text/plain":["(301653, 18)"]},"metadata":{},"execution_count":125}],"source":["# noise 처리\n","df = df[(df['BMI'] < 80) & (df['SleepTime'] != 24)]\n","df.shape"]},{"cell_type":"code","execution_count":126,"metadata":{"id":"5Agz3n1lDXB2","executionInfo":{"status":"ok","timestamp":1654852042524,"user_tz":-540,"elapsed":306,"user":{"displayName":"정태영","userId":"14526705904009737659"}}},"outputs":[],"source":["def yesno_tolabel(x):\n"," if (x == 'Yes') | (x == 'Yes (during pregnancy)') :\n"," return 1\n"," else:\n"," return 0"]},{"cell_type":"code","execution_count":127,"metadata":{"id":"8xz6HrlhDYfN","executionInfo":{"status":"ok","timestamp":1654852047989,"user_tz":-540,"elapsed":1624,"user":{"displayName":"정태영","userId":"14526705904009737659"}}},"outputs":[],"source":["sex_map = {'Female' : 0, 'Male' : 1}\n","health_map = {'Excellent' : 4, 'Very good' : 3, 'Good' : 2, 'Fair' : 1, 'Poor' : 0}\n","diabetic_map = {'Yes' : 2, 'Yes (during pregnancy)' : 2, 'No, borderline diabetes' : 1, 'No' : 0}\n","age_map = {'18-24' : 20, '25-29' : 25, '30-34' : 30, '35-39' : 35, '40-44' : 40, '45-49' : 45, '50-54' : 50, '55-59' : 55, '60-64' : 60, '65-69' : 65, '70-74' : 70, '75-79' : 75, '80 or older' : 80}\n","columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']\n","diabetic_map = {'Yes' : 2, 'Yes (during pregnancy)' : 2, 'No, borderline diabetes' : 1, 'No' : 0}\n","for column in columns:\n"," df[column] = df[column].apply(lambda x : yesno_tolabel(x))\n","\n","df['Sex'] = df['Sex'].map(sex_map)\n","df['GenHealth'] = df['GenHealth'].map(health_map)\n","df['Diabetic'] = df['Diabetic'].map(diabetic_map)\n","df['AgeCategory'] = df['AgeCategory'].map(age_map)"]},{"cell_type":"code","execution_count":128,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":488},"executionInfo":{"elapsed":310,"status":"ok","timestamp":1654852050815,"user":{"displayName":"정태영","userId":"14526705904009737659"},"user_tz":-540},"id":"t8q5dkwhDcK-","outputId":"0e0ca463-aa05-4109-ad0a-76054ef24c86"},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n","0 0 16.60 1 0 0 3.0 \n","1 0 20.34 0 0 1 0.0 \n","2 0 26.58 1 0 0 20.0 \n","3 0 24.21 0 0 0 0.0 \n","4 0 23.71 0 0 0 28.0 \n","... ... ... ... ... ... ... \n","319790 1 27.41 1 0 0 7.0 \n","319791 0 29.84 1 0 0 0.0 \n","319792 0 24.24 0 0 0 0.0 \n","319793 0 32.81 0 0 0 0.0 \n","319794 0 46.56 0 0 0 0.0 \n","\n"," MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n","0 30.0 0 0 55 White 2 \n","1 0.0 0 0 80 White 0 \n","2 30.0 0 1 65 White 2 \n","3 0.0 0 0 75 White 0 \n","4 0.0 1 0 40 White 0 \n","... ... ... ... ... ... ... \n","319790 0.0 1 1 60 Hispanic 2 \n","319791 0.0 0 1 35 Hispanic 0 \n","319792 0.0 0 0 45 Hispanic 0 \n","319793 0.0 0 0 25 Hispanic 0 \n","319794 0.0 0 0 80 Hispanic 0 \n","\n"," PhysicalActivity GenHealth SleepTime Asthma KidneyDisease \\\n","0 1 3 5.0 1 0 \n","1 1 3 7.0 0 0 \n","2 1 1 8.0 1 0 \n","3 0 2 6.0 0 0 \n","4 1 3 8.0 0 0 \n","... ... ... ... ... ... \n","319790 0 1 6.0 1 0 \n","319791 1 3 5.0 1 0 \n","319792 1 2 6.0 0 0 \n","319793 0 2 12.0 0 0 \n","319794 1 2 8.0 0 0 \n","\n"," SkinCancer \n","0 1 \n","1 0 \n","2 0 \n","3 1 \n","4 0 \n","... ... \n","319790 0 \n","319791 0 \n","319792 0 \n","319793 0 \n","319794 0 \n","\n","[301653 rows x 18 columns]"],"text/html":["\n","
| \n"," | HeartDisease | \n","BMI | \n","Smoking | \n","AlcoholDrinking | \n","Stroke | \n","PhysicalHealth | \n","MentalHealth | \n","DiffWalking | \n","Sex | \n","AgeCategory | \n","Race | \n","Diabetic | \n","PhysicalActivity | \n","GenHealth | \n","SleepTime | \n","Asthma | \n","KidneyDisease | \n","SkinCancer | \n","
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n","0 | \n","16.60 | \n","1 | \n","0 | \n","0 | \n","3.0 | \n","30.0 | \n","0 | \n","0 | \n","55 | \n","White | \n","2 | \n","1 | \n","3 | \n","5.0 | \n","1 | \n","0 | \n","1 | \n","
| 1 | \n","0 | \n","20.34 | \n","0 | \n","0 | \n","1 | \n","0.0 | \n","0.0 | \n","0 | \n","0 | \n","80 | \n","White | \n","0 | \n","1 | \n","3 | \n","7.0 | \n","0 | \n","0 | \n","0 | \n","
| 2 | \n","0 | \n","26.58 | \n","1 | \n","0 | \n","0 | \n","20.0 | \n","30.0 | \n","0 | \n","1 | \n","65 | \n","White | \n","2 | \n","1 | \n","1 | \n","8.0 | \n","1 | \n","0 | \n","0 | \n","
| 3 | \n","0 | \n","24.21 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0 | \n","0 | \n","75 | \n","White | \n","0 | \n","0 | \n","2 | \n","6.0 | \n","0 | \n","0 | \n","1 | \n","
| 4 | \n","0 | \n","23.71 | \n","0 | \n","0 | \n","0 | \n","28.0 | \n","0.0 | \n","1 | \n","0 | \n","40 | \n","White | \n","0 | \n","1 | \n","3 | \n","8.0 | \n","0 | \n","0 | \n","0 | \n","
| ... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","
| 319790 | \n","1 | \n","27.41 | \n","1 | \n","0 | \n","0 | \n","7.0 | \n","0.0 | \n","1 | \n","1 | \n","60 | \n","Hispanic | \n","2 | \n","0 | \n","1 | \n","6.0 | \n","1 | \n","0 | \n","0 | \n","
| 319791 | \n","0 | \n","29.84 | \n","1 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0 | \n","1 | \n","35 | \n","Hispanic | \n","0 | \n","1 | \n","3 | \n","5.0 | \n","1 | \n","0 | \n","0 | \n","
| 319792 | \n","0 | \n","24.24 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0 | \n","0 | \n","45 | \n","Hispanic | \n","0 | \n","1 | \n","2 | \n","6.0 | \n","0 | \n","0 | \n","0 | \n","
| 319793 | \n","0 | \n","32.81 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0 | \n","0 | \n","25 | \n","Hispanic | \n","0 | \n","0 | \n","2 | \n","12.0 | \n","0 | \n","0 | \n","0 | \n","
| 319794 | \n","0 | \n","46.56 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0 | \n","0 | \n","80 | \n","Hispanic | \n","0 | \n","1 | \n","2 | \n","8.0 | \n","0 | \n","0 | \n","0 | \n","
301653 rows × 18 columns
\n","| \n"," | HeartDisease | \n","BMI | \n","Smoking | \n","AlcoholDrinking | \n","Stroke | \n","PhysicalHealth | \n","MentalHealth | \n","DiffWalking | \n","Sex | \n","AgeCategory | \n","... | \n","SleepTime | \n","Asthma | \n","KidneyDisease | \n","SkinCancer | \n","(American Indian/Alaskan Native,) | \n","(Asian,) | \n","(Black,) | \n","(Hispanic,) | \n","(Other,) | \n","(White,) | \n","
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n","0 | \n","1.403407 | \n","1 | \n","0 | \n","0 | \n","0.669364 | \n","1.089487 | \n","0 | \n","0 | \n","55 | \n","... | \n","3.488639 | \n","1 | \n","0 | \n","1 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","
| 1 | \n","0 | \n","1.442302 | \n","0 | \n","0 | \n","1 | \n","0.000000 | \n","0.000000 | \n","0 | \n","0 | \n","80 | \n","... | \n","5.054525 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","
| 2 | \n","0 | \n","1.487128 | \n","1 | \n","0 | \n","0 | \n","0.801003 | \n","1.089487 | \n","0 | \n","1 | \n","65 | \n","... | \n","5.813435 | \n","1 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","
| 3 | \n","0 | \n","1.472250 | \n","0 | \n","0 | \n","0 | \n","0.000000 | \n","0.000000 | \n","0 | \n","0 | \n","75 | \n","... | \n","4.280563 | \n","0 | \n","0 | \n","1 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","
| 4 | \n","0 | \n","1.468816 | \n","0 | \n","0 | \n","0 | \n","0.807551 | \n","0.000000 | \n","1 | \n","0 | \n","40 | \n","... | \n","5.813435 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","
| ... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","... | \n","
| 301648 | \n","1 | \n","1.491857 | \n","1 | \n","0 | \n","0 | \n","0.755892 | \n","0.000000 | \n","1 | \n","1 | \n","60 | \n","... | \n","4.280563 | \n","1 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","0.0 | \n","0.0 | \n","
| 301649 | \n","0 | \n","1.504497 | \n","1 | \n","0 | \n","0 | \n","0.000000 | \n","0.000000 | \n","0 | \n","1 | \n","35 | \n","... | \n","3.488639 | \n","1 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","0.0 | \n","0.0 | \n","
| 301650 | \n","0 | \n","1.472452 | \n","0 | \n","0 | \n","0 | \n","0.000000 | \n","0.000000 | \n","0 | \n","0 | \n","45 | \n","... | \n","4.280563 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","0.0 | \n","0.0 | \n","
| 301651 | \n","0 | \n","1.517915 | \n","0 | \n","0 | \n","0 | \n","0.000000 | \n","0.000000 | \n","0 | \n","0 | \n","25 | \n","... | \n","8.734516 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","0.0 | \n","0.0 | \n","
| 301652 | \n","0 | \n","1.561599 | \n","0 | \n","0 | \n","0 | \n","0.000000 | \n","0.000000 | \n","0 | \n","0 | \n","80 | \n","... | \n","5.813435 | \n","0 | \n","0 | \n","0 | \n","0.0 | \n","0.0 | \n","0.0 | \n","1.0 | \n","0.0 | \n","0.0 | \n","
301653 rows × 23 columns
\n","