{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 處理 outliers\n", "* 新增欄位註記\n", "* outliers 或 NA 填補\n", " 1. 平均數 (mean)\n", " 2. 中位數 (median, or Q50)\n", " 3. 最大/最小值 (max/min, Q100, Q0)\n", " 4. 分位數 (quantile)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import 需要的套件\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "%matplotlib inline\n", "\n", "# 設定 data_path\n", "dir_data = './data/'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Path of read in data: ./data/application_train.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITY...FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
01000021Cash loansMNY0202500.0406597.524700.5...00000.00.00.00.00.01.0
11000030Cash loansFNN0270000.01293502.535698.5...00000.00.00.00.00.00.0
21000040Revolving loansMYY067500.0135000.06750.0...00000.00.00.00.00.00.0
31000060Cash loansFNY0135000.0312682.529686.5...0000NaNNaNNaNNaNNaNNaN
41000070Cash loansMNY0121500.0513000.021865.5...00000.00.00.00.00.00.0
\n", "

5 rows × 122 columns

\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR \\\n", "0 100002 1 Cash loans M N \n", "1 100003 0 Cash loans F N \n", "2 100004 0 Revolving loans M Y \n", "3 100006 0 Cash loans F N \n", "4 100007 0 Cash loans M N \n", "\n", " FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY \\\n", "0 Y 0 202500.0 406597.5 24700.5 \n", "1 N 0 270000.0 1293502.5 35698.5 \n", "2 Y 0 67500.0 135000.0 6750.0 \n", "3 Y 0 135000.0 312682.5 29686.5 \n", "4 Y 0 121500.0 513000.0 21865.5 \n", "\n", " ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 \\\n", "0 ... 0 0 0 0 \n", "1 ... 0 0 0 0 \n", "2 ... 0 0 0 0 \n", "3 ... 0 0 0 0 \n", "4 ... 0 0 0 0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY \\\n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON \\\n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR \n", "0 0.0 1.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", "[5 rows x 122 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f_app = os.path.join(dir_data, 'application_train.csv')\n", "print('Path of read in data: %s' % (f_app))\n", "app_train = pd.read_csv(f_app)\n", "app_train.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 265992.000000\n", "mean 0.006402\n", "std 0.083849\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 0.000000\n", "max 4.000000\n", "Name: AMT_REQ_CREDIT_BUREAU_HOUR, dtype: float64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "app_train['AMT_REQ_CREDIT_BUREAU_HOUR'].describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "count 307499.000000\n", "mean 27108.573909\n", "std 14493.737315\n", "min 1615.500000\n", "25% 16524.000000\n", "50% 24903.000000\n", "75% 34596.000000\n", "max 258025.500000\n", "Name: AMT_ANNUITY, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 如果欄位中有 NA, describe 會有問題\n", "app_train['AMT_ANNUITY'].describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1615.5, 16524.0, 24903.0, 34596.0, 258025.5]\n" ] } ], "source": [ "# Ignore NA, 計算五值\n", "five_num = [0, 25, 50, 75, 100]\n", "quantile_5s = [np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = i) for i in five_num]\n", "print(quantile_5s)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFBlJREFUeJzt3X+sZHV5x/H300WoQS2LlJsNu+nFZv+QlhTxBtbQmFtNlwX/WEywgZCyKMkaC6kmNHGxf0BEE2yiJqSKrnEjNFag/ggbXbvdUCaNiSBLS4EV6V7pVq67YUMXkdWkdu3TP+Z78bDfmXtn7t57Z+7M+5VM5swz33PmPHuG++H8mJnITCRJavqtQa+AJGn4GA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqnDboFVisc845JycnJ/ua5xe/+AVnnnnm8qzQkBmnXmG8+rXX0bUS/T7++OMvZubvLjRu1YbD5OQk+/fv72ueVqvF9PT08qzQkBmnXmG8+rXX0bUS/UbEf/UyzsNKkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqTKqv2E9GoxueM7r04fuvM9A1wTSeqdew6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqLBgOEbEhIh6OiGci4kBEfLjUb4+In0bEE+V2ZWOeWyNiJiKejYjLG/UtpTYTETsa9fMj4tGIOBgR90fE6UvdqCSpd73sOZwAbsnMtwKbgJsi4oLy3Gcz86Jy2wNQnrsG+ANgC/D5iFgTEWuAzwFXABcA1zaW86myrI3AS8CNS9SfJGkRFgyHzDySmf9apl8BngHOm2eWrcB9mfk/mfmfwAxwSbnNZOZzmfkr4D5ga0QE8C7g62X+e4CrFtuQJOnU9fV7DhExCbwNeBS4DLg5Iq4H9tPeu3iJdnA80phtlt+EyfMn1S8F3gz8LDNPdBi/avi7DZJGSc/hEBFvAL4BfCQzfx4RdwN3AFnuPw18AIgOsyed91JynvGd1mE7sB1
gYmKCVqvV6+oDcPz48b7n6dUtF554dbr5Gt3qy205ex1G49SvvY6uYeq3p3CIiNfRDoavZuY3ATLzhcbzXwK+XR7OAhsas68HDpfpTvUXgbMi4rSy99Ac/xqZuRPYCTA1NZXT09O9rP6rWq0W/c7Tqxuaew7XTS9YX27L2eswGqd+7XV0DVO/vVytFMCXgWcy8zON+rrGsPcCT5fp3cA1EXFGRJwPbAR+ADwGbCxXJp1O+6T17sxM4GHg6jL/NuDBU2tLknQqetlzuAz4c+CpiHii1D5G+2qji2gfAjoEfBAgMw9ExAPAD2lf6XRTZv4aICJuBvYCa4BdmXmgLO+jwH0R8Qng32iHkSRpQBYMh8z8Hp3PC+yZZ55PAp/sUN/Tab7MfI721UySpCHgJ6QlSRXDQZJUMRwkSZW+PgSn3jQ/ECdJq5F7DpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkip9z6IE/5CNp3LjnIEmquOfQJ/ciJI0D9xwkSRXDQZJUMRwkSRXDQZJUMRwkSRXDQZJUMRwkSRXDQZJUMRwkSRU/IT0gftJa0jBzz0GSVDEcJEkVw0GSVDEcJEkVw0GSVFkwHCJiQ0Q8HBHPRMSBiPhwqZ8dEfsi4mC5X1vqERF3RcRMRDwZERc3lrWtjD8YEdsa9bdHxFNlnrsiIpaj2aU2ueM7r94kaZT0sudwArglM98KbAJuiogLgB3AQ5m5EXioPAa4AthYbtuBu6EdJsBtwKXAJcBtc4FSxmxvzLfl1FuTJC3WguGQmUcy81/L9CvAM8B5wFbgnjLsHuCqMr0VuDfbHgHOioh1wOXAvsw8lpkvAfuALeW5N2Xm9zMzgXsby5IkDUBf5xwiYhJ4G/AoMJGZR6AdIMC5Zdh5wPON2WZLbb76bIe6JGlAev6EdES8AfgG8JHM/Pk8pwU6PZGLqHdah+20Dz8xMTFBq9VaYK1f6/jx433PA3DLhSf6nqeT5ms3l7mYdVrIYntdrcapX3sdXcPUb0/hEBGvox0MX83Mb5byCxGxLjOPlENDR0t9FtjQmH09cLjUp0+qt0p9fYfxlczcCewEmJqayunp6U7Dumq1WvQ7D8ANS3TC+dB1v3nt5jKb9aWy2F5Xq3Hq115H1zD128vVSgF8GXgmMz/TeGo3MHfF0TbgwUb9+nLV0ibg5XLYaS+wOSLWlhPRm4G95blXImJTea3rG8uSJA1AL3sOlwF/DjwVEU+U2seAO4EHIuJG4CfA+8pze4ArgRngl8D7ATLzWETcATxWxn08M4+V6Q8BXwFeD3y33CRJA7JgOGTm9+h8XgDg3R3GJ3BTl2XtAnZ1qO8H/nChdZEkrQw/IS1JqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqRKzz/2o1M3uUS/CyFJy809B0lSxXCQJFUMB0lSxXCQJFUMB0lSxXCQJFUMB0lSxXCQJFX8EFxD80Nqh+58zwDXRJIGyz0HSVLFcJAkVQwHSVLFcJAkVQwHSVLFq5W68Ou1JY0z9xwkSZUFwyEidkXE0Yh4ulG7PSJ+GhFPlNuVjedujYiZiHg2Ii5v1LeU2kxE7GjUz4+IRyPiYETcHxGnL2WDkqT+9bLn8BVgS4f6ZzPzonLbAxARFwDXAH9Q5vl8RKyJiDXA54ArgAuAa8tYgE+VZW0EXgJuPJWGJEmnbsFwyMx/AY71uLytwH2Z+T+Z+Z/ADHBJuc1k5nOZ+SvgPmBrRATwLuDrZf57gKv67EGStMRO5ZzDzRHxZDnstLbUzgOeb4yZLbVu9TcDP8vMEyfVJUkDtNirle4G7gCy3H8a+AAQHcYmnUMo5xnfUURsB7YDTExM0Gq1+lrp48ePzzvPLRee6Prccuq3j14s1OuoGad+7XV0DVO/iwqHzHxhbjoivgR8uzycBTY0hq4HDpfpTvUXgbMi4rSy99Ac3+l1dwI7AaampnJ
6erqv9W61Wsw3zw0Dunz10HXTS77MhXodNePUr72OrmHqd1GHlSJiXePhe4G5K5l2A9dExBkRcT6wEfgB8BiwsVyZdDrtk9a7MzOBh4Gry/zbgAcXs06SpKWz4J5DRHwNmAbOiYhZ4DZgOiIuon0I6BDwQYDMPBARDwA/BE4AN2Xmr8tybgb2AmuAXZl5oLzER4H7IuITwL8BX16y7iRJi7JgOGTmtR3KXf+AZ+YngU92qO8B9nSoP0f7aiZJ0pDw6zOGgD8yJGnY+PUZkqSK4SBJqhgOkqSK4SBJqhgOkqSKVysNmZN/ZMirlyQNgnsOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSKn5Aecv7Wg6RBcM9BklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJlQXDISJ2RcTRiHi6UTs7IvZFxMFyv7bUIyLuioiZiHgyIi5uzLOtjD8YEdsa9bdHxFNlnrsiIpa6SUlSf3rZc/gKsOWk2g7goczcCDxUHgNcAWwst+3A3dAOE+A24FLgEuC2uUApY7Y35jv5tSRJK2zBcMjMfwGOnVTeCtxTpu8BrmrU7822R4CzImIdcDmwLzOPZeZLwD5gS3nuTZn5/cxM4N7GsiRJA7LYcw4TmXkEoNyfW+rnAc83xs2W2nz12Q51SdIALfWP/XQ6X5CLqHdeeMR22oegmJiYoNVq9bVyx48fn3eeWy480dfyVlo//S7U66gZp37tdXQNU7+LDYcXImJdZh4ph4aOlvossKExbj1wuNSnT6q3Sn19h/EdZeZOYCfA1NRUTk9PdxvaUavVYr55bmj86towOnTddM9jF+p11IxTv/Y6uoap38UeVtoNzF1xtA14sFG/vly1tAl4uRx22gtsjoi15UT0ZmBvee6ViNhUrlK6vrEsSdKALLjnEBFfo/1//edExCztq47uBB6IiBuBnwDvK8P3AFcCM8AvgfcDZOaxiLgDeKyM+3hmzp3k/hDtK6JeD3y33CRJA7RgOGTmtV2eeneHsQnc1GU5u4BdHer7gT9caD0kSSvHT0hLkiqGgySpYjhIkiqGgySpYjhIkiqGgySpYjhIkipL/d1KWkaTja/3OHTnewa4JpJGnXsOkqSKew6rlHsRkpaTew6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrfrTQC/J4lSUvNPQdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUuWUwiEiDkXEUxHxRETsL7WzI2JfRBws92tLPSLiroiYiYgnI+LixnK2lfEHI2LbqbU03iZ3fIfJHd/hqZ++POhVkbSKLcWew59k5kWZOVUe7wAeysyNwEPlMcAVwMZy2w7cDe0wAW4DLgUuAW6bCxRJ0mAsx2GlrcA9Zfoe4KpG/d5sewQ4KyLWAZcD+zLzWGa+BOwDtizDekmSenSq4ZDAP0XE4xGxvdQmMvMIQLk/t9TPA55vzDtbat3qkqQBOdXvVrosMw9HxLnAvoj40Txjo0Mt56nXC2gH0HaAiYkJWq1WXyt7/Pjxeee55cITfS1vmE28nr7/fVazhbbtKLHX0TVM/Z5SOGTm4XJ/NCK+RfucwQsRsS4zj5TDRkfL8FlgQ2P29cDhUp8+qd7q8no7gZ0AU1NTOT093WlYV61Wi/nmuaHxBXar3S0XnuDP+vz3Wc0W2rajxF5H1zD1u+jDShFxZkS8cW4a2Aw8DewG5q442gY8WKZ3A9eXq5Y2AS+Xw057gc0RsbaciN5capKkATmVPYcJ4FsRMbecv8/Mf4yIx4AHIuJG4CfA+8r4PcCVwAzwS+D9AJl5LCLuAB4r4z6emcdOYb1U+FXekhZr0eGQmc8Bf9Sh/t/AuzvUE7ipy7J2AbsWuy6SpKXlJ6QlSRXDQZJUMRwkSRXDQZJUMRwkSZV
T/YT0qjc5Qh98k6SlMvbhMC78zIOkfnhYSZJUMRwkSRXDQZJU8ZzDGPL8g6SFuOcgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkih+C06v8cJykOYbDmPMryyV14mElSVLFPQctyMNN0vhxz0GSVDEcJEkVDyupLx5iksaDew6SpIp7DurIS1yl8WY4aNE8xCSNrqE5rBQRWyLi2YiYiYgdg14fSRpnQ7HnEBFrgM8BfwrMAo9FxO7M/OFyvJ6HTJaeexHSaBmKcAAuAWYy8zmAiLgP2AosSzhoefUSvgaINNyGJRzOA55vPJ4FLh3QumgFLOXeWzNouu3BGFhSf4YlHKJDLatBEduB7eXh8Yh4ts/XOQd4sc95VqW/HKNe41NAh35Lvd/lrAZjs20Zr15hZfr9vV4GDUs4zAIbGo/XA4dPHpSZO4Gdi32RiNifmVOLnX81GadeYbz6tdfRNUz9DsvVSo8BGyPi/Ig4HbgG2D3gdZKksTUUew6ZeSIibgb2AmuAXZl5YMCrJUljayjCASAz9wB7lvllFn1IahUap15hvPq119E1NP1GZnXeV5I05oblnIMkaYiMTTis1q/niIhDEfFURDwREftL7eyI2BcRB8v92lKPiLir9PhkRFzcWM62Mv5gRGxr1N9elj9T5u10WfFy9rcrIo5GxNON2rL31+01BtDr7RHx07J9n4iIKxvP3VrW+9mIuLxR7/heLhd0PFp6ur9c3EFEnFEez5TnJ1eg1w0R8XBEPBMRByLiw6U+qtu2W7+rd/tm5sjfaJ/k/jHwFuB04N+BCwa9Xj2u+yHgnJNqfwPsKNM7gE+V6SuB79L+3Mgm4NFSPxt4rtyvLdNry3M/AN5R5vkucMUK9/dO4GLg6ZXsr9trDKDX24G/6jD2gvI+PQM4v7x/18z3XgYeAK4p018APlSm/wL4Qpm+Brh/BXpdB1xcpt8I/EfpaVS3bbd+V+32XbE/AoO8lTfQ3sbjW4FbB71ePa77IepweBZYV6bXAc+W6S8C1548DrgW+GKj/sVSWwf8qFF/zbgV7HGS1/7BXPb+ur3GAHrt9sfjNe9R2lfyvaPbe7n8gXwROK3UXx03N2+ZPq2MixXexg/S/u60kd22Xfpdtdt3XA4rdfp6jvMGtC79SuCfIuLxaH9CHGAiM48AlPtzS71bn/PVZzvUB20l+uv2GoNwczmUsqtxCKTfXt8M/CwzT5xUf82yyvMvl/ErohzmeBvwKGOwbU/qF1bp9h2XcOjp6zmG1GWZeTFwBXBTRLxznrHd+uy3PqxGsb+7gd8HLgKOAJ8u9aXsdWD/DhHxBuAbwEcy8+fzDe1QW3XbtkO/q3b7jks49PT1HMMoMw+X+6PAt2h/g+0LEbEOoNwfLcO79TlffX2H+qCtRH/dXmNFZeYLmfnrzPw/4Eu0ty/03+uLwFkRcdpJ9dcsqzz/O8Cxpe/mtSLidbT/UH41M79ZyiO7bTv1u5q377iEw6r8eo6IODMi3jg3DWwGnqa97nNXbWyjfXyTUr++XPmxCXi57FbvBTZHxNqyW7uZ9vHKI8ArEbGpXOlxfWNZg7QS/XV7jRU190eseC/t7Qvt9bumXIlyPrCR9gnYju/lbB9wfhi4usx/8r/bXK9XA/9cxi+b8u/9ZeCZzPxM46mR3Lbd+l3V23elT9QM6kb7aoj/oH0lwF8Pen16XOe30L5a4d+BA3PrTft44kPAwXJ/dqkH7R9N+jHwFDDVWNYHgJlye3+jPlXesD8G/paVP1H5Ndq72/9L+/+AblyJ/rq9xgB6/bvSy5O0/yNf1xj/12W9n6VxFVm393J5v/yg/Bv8A3BGqf92eTxTnn/LCvT6x7QPbTwJPFFuV47wtu3W76rdvn5CWpJUGZfDSpKkPhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqTK/wMa8bDNlfPtDAAAAABJRU5ErkJggg==\n", 
"text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#'~' 波浪符號為取反的意思\n", "app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'].hist(bins = 100)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFBlJREFUeJzt3X+sZHV5x/H300WoQS2LlJsNu+nFZv+QlhTxBtbQmFtNlwX/WEywgZCyKMkaC6kmNHGxf0BEE2yiJqSKrnEjNFag/ggbXbvdUCaNiSBLS4EV6V7pVq67YUMXkdWkdu3TP+Z78bDfmXtn7t57Z+7M+5VM5swz33PmPHuG++H8mJnITCRJavqtQa+AJGn4GA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqnDboFVisc845JycnJ/ua5xe/+AVnnnnm8qzQkBmnXmG8+rXX0bUS/T7++OMvZubvLjRu1YbD5OQk+/fv72ueVqvF9PT08qzQkBmnXmG8+rXX0bUS/UbEf/UyzsNKkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqTKqv2E9GoxueM7r04fuvM9A1wTSeqdew6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqLBgOEbEhIh6OiGci4kBEfLjUb4+In0bEE+V2ZWOeWyNiJiKejYjLG/UtpTYTETsa9fMj4tGIOBgR90fE6UvdqCSpd73sOZwAbsnMtwKbgJsi4oLy3Gcz86Jy2wNQnrsG+ANgC/D5iFgTEWuAzwFXABcA1zaW86myrI3AS8CNS9SfJGkRFgyHzDySmf9apl8BngHOm2eWrcB9mfk/mfmfwAxwSbnNZOZzmfkr4D5ga0QE8C7g62X+e4CrFtuQJOnU9fV7DhExCbwNeBS4DLg5Iq4H9tPeu3iJdnA80phtlt+EyfMn1S8F3gz8LDNPdBi/avi7DZJGSc/hEBFvAL4BfCQzfx4RdwN3AFnuPw18AIgOsyed91JynvGd1mE7sB1gYmKCVqvV6+oDcPz48b7n6dUtF554dbr5Gt3qy205ex1G49SvvY6uYeq3p3CIiNfRDoavZuY3ATLzhcbzXwK+XR7OAhsas68HDpfpTvUXgbMi4rSy99Ac/xqZuRPYCTA1NZXT09O9rP6rWq0W/c7Tqxuaew7XTS9YX27L2eswGqd+7XV0DVO/vVytFMCXgWcy8zON+rrGsPcCT5fp3cA1EXFGRJwPbAR+ADwGbCxXJp1O+6T17sxM4GHg6jL/NuDBU2tLknQqetlzuAz4c+CpiHii1D5G+2qji2gfAjoEfBAgMw9ExAPAD2lf6XRTZv4aICJuBvYCa4BdmXmgLO+jwH0R8Qng32iHkSRpQBYMh8z8Hp3PC+yZZ55PAp/sUN/Tab7MfI721UySpCHgJ6QlSRXDQZJUMRwkSZW+PgSn3jQ/ECdJq5F7DpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkip9z6IE/5CNp3LjnIEmquOfQJ/ciJI0D9xwkSRXDQZJUMRwkSRXDQZJUMRwkSRXDQZJUMRwkSRXDQZJUMRwkSRU/IT0gftJa0jBzz0GSVDE
cJEkVw0GSVDEcJEkVw0GSVFkwHCJiQ0Q8HBHPRMSBiPhwqZ8dEfsi4mC5X1vqERF3RcRMRDwZERc3lrWtjD8YEdsa9bdHxFNlnrsiIpaj2aU2ueM7r94kaZT0sudwArglM98KbAJuiogLgB3AQ5m5EXioPAa4AthYbtuBu6EdJsBtwKXAJcBtc4FSxmxvzLfl1FuTJC3WguGQmUcy81/L9CvAM8B5wFbgnjLsHuCqMr0VuDfbHgHOioh1wOXAvsw8lpkvAfuALeW5N2Xm9zMzgXsby5IkDUBf5xwiYhJ4G/AoMJGZR6AdIMC5Zdh5wPON2WZLbb76bIe6JGlAev6EdES8AfgG8JHM/Pk8pwU6PZGLqHdah+20Dz8xMTFBq9VaYK1f6/jx433PA3DLhSf6nqeT5ms3l7mYdVrIYntdrcapX3sdXcPUb0/hEBGvox0MX83Mb5byCxGxLjOPlENDR0t9FtjQmH09cLjUp0+qt0p9fYfxlczcCewEmJqayunp6U7Dumq1WvQ7D8ANS3TC+dB1v3nt5jKb9aWy2F5Xq3Hq115H1zD128vVSgF8GXgmMz/TeGo3MHfF0TbgwUb9+nLV0ibg5XLYaS+wOSLWlhPRm4G95blXImJTea3rG8uSJA1AL3sOlwF/DjwVEU+U2seAO4EHIuJG4CfA+8pze4ArgRngl8D7ATLzWETcATxWxn08M4+V6Q8BXwFeD3y33CRJA7JgOGTm9+h8XgDg3R3GJ3BTl2XtAnZ1qO8H/nChdZEkrQw/IS1JqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqRKzz/2o1M3uUS/CyFJy809B0lSxXCQJFUMB0lSxXCQJFUMB0lSxXCQJFUMB0lSxXCQJFX8EFxD80Nqh+58zwDXRJIGyz0HSVLFcJAkVQwHSVLFcJAkVQwHSVLFq5W68Ou1JY0z9xwkSZUFwyEidkXE0Yh4ulG7PSJ+GhFPlNuVjedujYiZiHg2Ii5v1LeU2kxE7GjUz4+IRyPiYETcHxGnL2WDkqT+9bLn8BVgS4f6ZzPzonLbAxARFwDXAH9Q5vl8RKyJiDXA54ArgAuAa8tYgE+VZW0EXgJuPJWGJEmnbsFwyMx/AY71uLytwH2Z+T+Z+Z/ADHBJuc1k5nOZ+SvgPmBrRATwLuDrZf57gKv67EGStMRO5ZzDzRHxZDnstLbUzgOeb4yZLbVu9TcDP8vMEyfVJUkDtNirle4G7gCy3H8a+AAQHcYmnUMo5xnfUURsB7YDTExM0Gq1+lrp48ePzzvPLRee6Prccuq3j14s1OuoGad+7XV0DVO/iwqHzHxhbjoivgR8uzycBTY0hq4HDpfpTvUXgbMi4rSy99Ac3+l1dwI7AaampnJ6erqv9W61Wsw3zw0Dunz10HXTS77MhXodNePUr72OrmHqd1GHlSJiXePhe4G5K5l2A9dExBkRcT6wEfgB8BiwsVyZdDrtk9a7MzOBh4Gry/zbgAcXs06SpKWz4J5DRHwNmAbOiYhZ4DZgOiIuon0I6BDwQYDMPBARDwA/BE4AN2Xmr8tybgb2AmuAXZl5oLzER4H7IuITwL8BX16y7iRJi7JgOGTmtR3KXf+AZ+YngU92qO8B9nSoP0f7aiZJ0pDw6zOGgD8yJGnY+PUZkqSK4SBJqhgOkqSK4SBJqhgOkqSKVysNmZN/ZMirlyQNgnsOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSKn5Aecv7Wg6RBcM9BklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJFcNBklQxHCRJlQXDISJ2RcTRiHi6UTs7IvZFxMFyv7bUIyLuioiZiHgyIi5uzLOtjD8YEdsa9bdHxFNlnrsiIpa6SUlSf3rZc/gKsOWk2g7goczcCDxUHgNcAWwst+3A3dAOE+A24FL
gEuC2uUApY7Y35jv5tSRJK2zBcMjMfwGOnVTeCtxTpu8BrmrU7822R4CzImIdcDmwLzOPZeZLwD5gS3nuTZn5/cxM4N7GsiRJA7LYcw4TmXkEoNyfW+rnAc83xs2W2nz12Q51SdIALfWP/XQ6X5CLqHdeeMR22oegmJiYoNVq9bVyx48fn3eeWy480dfyVlo//S7U66gZp37tdXQNU7+LDYcXImJdZh4ph4aOlvossKExbj1wuNSnT6q3Sn19h/EdZeZOYCfA1NRUTk9PdxvaUavVYr55bmj86towOnTddM9jF+p11IxTv/Y6uoap38UeVtoNzF1xtA14sFG/vly1tAl4uRx22gtsjoi15UT0ZmBvee6ViNhUrlK6vrEsSdKALLjnEBFfo/1//edExCztq47uBB6IiBuBnwDvK8P3AFcCM8AvgfcDZOaxiLgDeKyM+3hmzp3k/hDtK6JeD3y33CRJA7RgOGTmtV2eeneHsQnc1GU5u4BdHer7gT9caD0kSSvHT0hLkiqGgySpYjhIkiqGgySpYjhIkiqGgySpYjhIkipL/d1KWkaTja/3OHTnewa4JpJGnXsOkqSKew6rlHsRkpaTew6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrfrTQC/J4lSUvNPQdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUuWUwiEiDkXEUxHxRETsL7WzI2JfRBws92tLPSLiroiYiYgnI+LixnK2lfEHI2LbqbU03iZ3fIfJHd/hqZ++POhVkbSKLcWew59k5kWZOVUe7wAeysyNwEPlMcAVwMZy2w7cDe0wAW4DLgUuAW6bCxRJ0mAsx2GlrcA9Zfoe4KpG/d5sewQ4KyLWAZcD+zLzWGa+BOwDtizDekmSenSq4ZDAP0XE4xGxvdQmMvMIQLk/t9TPA55vzDtbat3qkqQBOdXvVrosMw9HxLnAvoj40Txjo0Mt56nXC2gH0HaAiYkJWq1WXyt7/Pjxeee55cITfS1vmE28nr7/fVazhbbtKLHX0TVM/Z5SOGTm4XJ/NCK+RfucwQsRsS4zj5TDRkfL8FlgQ2P29cDhUp8+qd7q8no7gZ0AU1NTOT093WlYV61Wi/nmuaHxBXar3S0XnuDP+vz3Wc0W2rajxF5H1zD1u+jDShFxZkS8cW4a2Aw8DewG5q442gY8WKZ3A9eXq5Y2AS+Xw057gc0RsbaciN5capKkATmVPYcJ4FsRMbecv8/Mf4yIx4AHIuJG4CfA+8r4PcCVwAzwS+D9AJl5LCLuAB4r4z6emcdOYb1U+FXekhZr0eGQmc8Bf9Sh/t/AuzvUE7ipy7J2AbsWuy6SpKXlJ6QlSRXDQZJUMRwkSRXDQZJUMRwkSZVT/YT0qjc5Qh98k6SlMvbhMC78zIOkfnhYSZJUMRwkSRXDQZJU8ZzDGPL8g6SFuOcgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkih+C06v8cJykOYbDmPMryyV14mElSVLFPQctyMNN0vhxz0GSVDEcJEkVDyupLx5iksaDew6SpIp7DurIS1yl8WY4aNE8xCSNrqE5rBQRWyLi2YiYiYgdg14fSRpnQ7HnEBFrgM8BfwrMAo9FxO7M/OFyvJ6HTJaeexHSaBmKcAAuAWYy8zmAiLgP2AosSzhoefUSvgaINNyGJRzOA55vPJ4FLh3QumgFLOXeWzNouu3BGFhSf4YlHKJDLatBEduB7eXh8Yh4ts/XOQd4sc95VqW/HKNe41NAh35Lvd/lrAZjs20Zr15hZfr9vV4GDUs4zAIbGo/XA4dPHpSZO4Gdi32RiNifmVOLnX81GadeYbz6tdfRNUz9DsvVSo8BGyPi/Ig4HbgG2D3gdZKksTUUew6ZeSIibgb2AmuAXZl5YMCrJUljayjCASAz9wB7lvllFn1IahUap15hvPq119E1NP1GZnX
eV5I05oblnIMkaYiMTTis1q/niIhDEfFURDwREftL7eyI2BcRB8v92lKPiLir9PhkRFzcWM62Mv5gRGxr1N9elj9T5u10WfFy9rcrIo5GxNON2rL31+01BtDr7RHx07J9n4iIKxvP3VrW+9mIuLxR7/heLhd0PFp6ur9c3EFEnFEez5TnJ1eg1w0R8XBEPBMRByLiw6U+qtu2W7+rd/tm5sjfaJ/k/jHwFuB04N+BCwa9Xj2u+yHgnJNqfwPsKNM7gE+V6SuB79L+3Mgm4NFSPxt4rtyvLdNry3M/AN5R5vkucMUK9/dO4GLg6ZXsr9trDKDX24G/6jD2gvI+PQM4v7x/18z3XgYeAK4p018APlSm/wL4Qpm+Brh/BXpdB1xcpt8I/EfpaVS3bbd+V+32XbE/AoO8lTfQ3sbjW4FbB71ePa77IepweBZYV6bXAc+W6S8C1548DrgW+GKj/sVSWwf8qFF/zbgV7HGS1/7BXPb+ur3GAHrt9sfjNe9R2lfyvaPbe7n8gXwROK3UXx03N2+ZPq2MixXexg/S/u60kd22Xfpdtdt3XA4rdfp6jvMGtC79SuCfIuLxaH9CHGAiM48AlPtzS71bn/PVZzvUB20l+uv2GoNwczmUsqtxCKTfXt8M/CwzT5xUf82yyvMvl/ErohzmeBvwKGOwbU/qF1bp9h2XcOjp6zmG1GWZeTFwBXBTRLxznrHd+uy3PqxGsb+7gd8HLgKOAJ8u9aXsdWD/DhHxBuAbwEcy8+fzDe1QW3XbtkO/q3b7jks49PT1HMMoMw+X+6PAt2h/g+0LEbEOoNwfLcO79TlffX2H+qCtRH/dXmNFZeYLmfnrzPw/4Eu0ty/03+uLwFkRcdpJ9dcsqzz/O8Cxpe/mtSLidbT/UH41M79ZyiO7bTv1u5q377iEw6r8eo6IODMi3jg3DWwGnqa97nNXbWyjfXyTUr++XPmxCXi57FbvBTZHxNqyW7uZ9vHKI8ArEbGpXOlxfWNZg7QS/XV7jRU190eseC/t7Qvt9bumXIlyPrCR9gnYju/lbB9wfhi4usx/8r/bXK9XA/9cxi+b8u/9ZeCZzPxM46mR3Lbd+l3V23elT9QM6kb7aoj/oH0lwF8Pen16XOe30L5a4d+BA3PrTft44kPAwXJ/dqkH7R9N+jHwFDDVWNYHgJlye3+jPlXesD8G/paVP1H5Ndq72/9L+/+AblyJ/rq9xgB6/bvSy5O0/yNf1xj/12W9n6VxFVm393J5v/yg/Bv8A3BGqf92eTxTnn/LCvT6x7QPbTwJPFFuV47wtu3W76rdvn5CWpJUGZfDSpKkPhgOkqSK4SBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqTK/wMa8bDNlfPtDAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#等價上一行代碼\n", "app_train[app_train['AMT_ANNUITY'].notnull()]['AMT_ANNUITY'].hist(bins = 100)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# 試著將 max 取代為 q99\n", "app_train[app_train['AMT_ANNUITY'] == app_train['AMT_ANNUITY'].max()] = np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = 99)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "70006.5" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = 99)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1615.5, 16524.0, 24903.0, 34596.0, 230161.5]\n" ] } ], "source": [ "five_num = [0, 25, 50, 75, 100]\n", "quantile_5s = [np.percentile(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'], q = i) for i in five_num]\n", "print(quantile_5s)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "可以看到原本的\n", "
max 258025.5\n", "
被取代後變為\n", "
max 230161.5" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "24903.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 得到 median 的另外一種方法\n", "np.median(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ModeResult(mode=array([9000.]), count=array([6385]))\n", "Elapsed time: 5.396 secs\n" ] } ], "source": [ "# 計算眾數 (mode)\n", "from scipy.stats import mode\n", "import time\n", "\n", "start_time = time.time()\n", "mode_get = mode(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'])\n", "print(mode_get)\n", "print(\"Elapsed time: %.3f secs\" % (time.time() - start_time))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(9000.0, 6385)\n", "Elapsed time: 0.300 secs\n" ] } ], "source": [ "# Compute the mode of AMT_ANNUITY with a plain tally instead of scipy.stats.mode.\n", "# collections.Counter counts the non-null values in a single pass; most_common()\n", "# returns (value, count) pairs sorted by count in descending order, so the first\n", "# pair is the mode and its frequency.\n", "from collections import Counter\n", "\n", "start_time = time.time()\n", "mode_dict = Counter(app_train[~app_train['AMT_ANNUITY'].isnull()]['AMT_ANNUITY'])\n", "mode_get = mode_dict.most_common()\n", "print(mode_get[0])\n", "print(\"Elapsed time: %.3f secs\" % (time.time() - start_time))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 連續值標準化\n", "### 1. Z-transform: $ \\frac{(x - mean(x))}{std(x)} $\n", "### 2. Range (0 ~ 1): $ \\frac{x - min(x)}{max(x) - min(x)} $\n", "### 3. Range (-1 ~ 1): $ (\\frac{x - min(x)}{max(x) - min(x)} - 0.5) * 2 $" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAEICAYAAAC0+DhzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAFppJREFUeJzt3X+0ZWV93/H3R0bAgAiCThWQ0Timoix/ZJaQH00mYnUEA/6BLYYIWJLJsmhtQ6pYm2BV1sK4jAld/uikEMAYEU0TRsUQKt7aGEEgVnEghAFHmEAlCkwcjeLot3+cZ+hxnnPnnnvn/r7v11p33bOf/ey9n++dO+dzn733OSdVhSRJwx6z0AOQJC0+hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4aMVL8rYkf9wePy3JziT7zfIxtiV5yWzuU5pLhoPmXHti/EaSg4bafi3JxAIOa6SquqeqDq6qH87XMZNcluSd+7D9AUkuTfKPSf5vkt+czfG1Yzw3ybVJvpnEF0etAIaD5ssq4I37upMM+Hv7494GrAWOAX4JeFOSDbN8jB8AVwHnzPJ+tUj5n0zz5d3AbyU5dNTKJD+b5KYkO9r3nx1aN5HkwiSfB74LPKO1vTPJX7fTQJ9IcniSD7e/oG9KsmZoH3+Q5N627pYk/2KScaxJUklWJfmZtu/dX99Lsq31e0yS85PcleRbSa5K8sSh/bwmydfburdO9kNJshE4g8ET+s4kn2jtz241PpxkS5JT9vKzPRN4R1U9VFW3A38InL2X/tNWVXdU1SXAltncrxYvw0Hz5WZgAvitPVe0J9VPARcDhwO/B3wqyeFD3V4DbAQeD3y9tZ3e2o8EfhL4AvBHwBOB24ELhra/CXh+W/cnwMeSHLi3AVfVF9oppoOBw4AbgI+01f8OeCXwi8BTgYeA97V6jgU+0Mb21FbTUZMcYxPwYeB327F+OcljgU8Afwk8GXgD8OEkPzXiZ3dYO8aXh5q/DDxnb7VNJklmsp2WH8NB8+l3gDckedIe7ScDd1bVh6pqV1V9BPhb4JeH+lxWVVva+h+0tj+qqruqagfwaeCuqvqfVbUL+Bjwgt0bV9UfV9W32vbvAQ4AuifbvbgY+A6wexbwG8Bbq2p7VX2fwamd05KsAk4DPllVn2vrfhv40TSOdQJwMHBRVT1SVdcDnwRePaLvwe37jqG2HQxCtNOuT1zUZjzbkrwryXOSHJPkQmDkjEorj+GgeVNVX2XwJHf+Hqueyv+fDez2dQYzgt3uHbHLbww9/qcRy7ufOElyXpLb22mrh4EnAEeMM+4kvwGsB36lqnY/yR8D/Fk77fMwg5nKD4HVrZ5Hx1tV3wG+Nc6xmqcC9w4dC/qfx2472/dDhtoOAb49yb6PZxByxwG/ADzC4N/kegbXFf56GuPUMrZqoQegFecC4G+A9wy13cfgyXbY04C/GFqe8R0y7frCm4ETgS1V9aMkDwFTnkJp274D+Pk2Q9ntXuDfVNXnR2xzP/DsoeWfYHBqaTJ71nYfcHSSxwwFxNOAv+s2rHqoHe95wHWt+XlMfm3gr6rqc+3xPQxmNb+9l7FphXLmoHlVVVuBjzI4Z7/bNcCzkvxKuxD8r4FjGfxFOxseD+wC/gFYleR3+PG/tEdKcnQb65lVtecT8weBC5Mc0/o+Kcmpbd3HgVck+fkk+wNvZ+//174BPGNo+UYGf92/Kcljk6xncIrtykm2vwL4z0kOS/LPgV8HLhvVcY/ZyNjaXWIHAvu35QOTHDCTfWlpMBy0EN4OPPqah6r6FvAK4DwGp1/eBLyiqr45S8e7lsE1ib9jcHrme4w+TbWnE4F/Bnx86I6l3X+R/wGwGfjLJN9mcLH6+
FbPFuBcBhe+72dwsXr7Xo5zCXBsO0X151X1CHAK8HLgm8D7GQTU306y/QXAXa22/wW8u6r+YpK+M3UMg1N1u+v/J+COWT6GFpH4YT+SpD05c5AkdQwHSVLHcJAkdQwHSVJnyb7O4Ygjjqg1a9bMaNvvfOc7HHTQQVN3XEaseflbafWCNU/XLbfc8s2q2vMdCkZasuGwZs0abr755hltOzExwfr162d3QIucNS9/K61esObpSrLnOxFMytNKkqSO4SBJ6hgOkqSO4SBJ6hgOkqSO4SBJ6hgOkqSO4SBJ6hgOkqTOkn2F9Hxac/6nRrZvu+jkeR6JJM0PZw6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpM7Y4ZBkvyRfSvLJtvz0JDcmuTPJR5Ps39oPaMtb2/o1Q/t4S2u/I8nLhto3tLatSc6fvfIkSTMxnZnDG4Hbh5bfBby3qtYCDwHntPZzgIeq6pnAe1s/khwLnA48B9gAvL8Fzn7A+4CXA8cCr259JUkLZKxwSHIUcDLw39tygBcDH29dLgde2R6f2pZp609s/U8Frqyq71fV14CtwIva19aquruqHgGubH0lSQtk3JnD7wNvAn7Ulg8HHq6qXW15O3Bke3wkcC9AW7+j9X+0fY9tJmuXJC2QVVN1SPIK4IGquiXJ+t3NI7rWFOsmax8VUDWijSQbgY0Aq1evZmJiYvKB78XOnTunte15x+0a2T7T4y+E6da8HKy0mldavWDNc2nKcAB+DjglyUnAgcAhDGYShyZZ1WYHRwH3tf7bgaOB7UlWAU8AHhxq3214m8naf0xVbQI2Aaxbt67Wr18/xvB7ExMTTGfbs8//1Mj2bWfM7PgLYbo1LwcrreaVVi9Y81ya8rRSVb2lqo6qqjUMLihfX1VnAJ8FTmvdzgKubo83t2Xa+uurqlr76e1upqcDa4EvAjcBa9vdT/u3Y2yeleokSTMyzsxhMm8GrkzyTuBLwCWt/RLgQ0m2MpgxnA5QVVuSXAXcBuwCzq2qHwIkeT1wLbAfcGlVbdmHcUmS9tG0wqGqJoCJ9vhuBnca7dnne8CrJtn+QuDCEe3XANdMZyySpLnjK6QlSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUWbXQA1hJ1pz/qZHt2y46eZ5HIkl758xBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktSZMhySHJjki0m+nGRLkv/S2p+e5MYkdyb5aJL9W/sBbXlrW79maF9vae13JHnZUPuG1rY1yfmzX6YkaTrGmTl8H3hxVT0PeD6wIckJwLuA91bVWuAh4JzW/xzgoap6JvDe1o8kxwKnA88BNgDvT7Jfkv2A9wEvB44FXt36SpIWyJThUAM72+Jj21cBLwY+3tovB17ZHp/almnrT0yS1n5lVX2/qr4GbAVe1L62VtXdVfUIcGXrK0laIGN92E/76/4W4JkM/sq/C3i4qna1LtuBI9vjI4F7AapqV5IdwOGt/Yah3Q5vc+8e7cdPMo6NwEaA1atXMzExMc7wOzt37pzWtucdt2tk+3SPP1v7mYnp1rwcrLSaV1q9YM1zaaxwqKofAs9PcijwZ8CzR3Vr3zPJusnaR81eakQbVbUJ2ASwbt26Wr9+/d4HPomJiQmms+3Zk32C2xnTO/5s7WcmplvzcrDSal5p9YI1z6Vp3a1UVQ8DE8AJwKFJdofLUcB97fF24GiAtv4JwIPD7XtsM1m7JGmBjHO30pPajIEkjwNeAtwOfBY4r
XU7C7i6Pd7clmnrr6+qau2nt7uZng6sBb4I3ASsbXc/7c/govXm2ShOkjQz45xWegpwebvu8Bjgqqr6ZJLbgCuTvBP4EnBJ638J8KEkWxnMGE4HqKotSa4CbgN2Aee201UkeT1wLbAfcGlVbZm1CqdhzSSnfSRppZkyHKrqK8ALRrTfzeBOoz3bvwe8apJ9XQhcOKL9GuCaMcYrSZoHvkJaktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJnbE+7EejTfYurtsuOnmeRyJJs8twWAQMGUmLjaeVJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1PFjQhcxPz5U0kJx5iBJ6hgOkqSO4SBJ6kwZDkmOTvLZJLcn2ZLkja39iUmuS3Jn+35Ya0+Si5NsTfKVJC8c2tdZrf+dSc4aav/pJLe2bS5OkrkoVpI0nnFmDruA86rq2cAJwLlJjgXOBz5TVWuBz7RlgJcDa9vXRuADMAgT4ALgeOBFwAW7A6X12Ti03YZ9L02SNFNThkNV3V9Vf9Mefxu4HTgSOBW4vHW7HHhle3wqcEUN3AAcmuQpwMuA66rqwap6CLgO2NDWHVJVX6iqAq4Y2pckaQFM61bWJGuAFwA3Aqur6n4YBEiSJ7duRwL3Dm22vbXtrX37iPZRx9/IYIbB6tWrmZiYmM7wH7Vz586R25533K4Z7W9Pk41rrve/N5PVvJyttJpXWr1gzXNp7HBIcjDwp8C/r6p/3MtlgVEragbtfWPVJmATwLp162r9+vVTjHq0iYkJRm179iSvK5iubWf0+56P/e/NZDUvZyut5pVWL1jzXBrrbqUkj2UQDB+uqv/Rmr/RTgnRvj/Q2rcDRw9tfhRw3xTtR41olyQtkHHuVgpwCXB7Vf3e0KrNwO47js4Crh5qP7PdtXQCsKOdfroWeGmSw9qF6JcC17Z1305yQjvWmUP7kiQtgHFOK/0c8Brg1iT/p7X9J+Ai4Kok5wD3AK9q664BTgK2At8FXgtQVQ8meQdwU+v39qp6sD1+HXAZ8Djg0+1LkrRApgyHqvorRl8XADhxRP8Czp1kX5cCl45ovxl47lRjkSTND18hLUnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnq+BnSy8xknzt92YaD5nkkkpYyZw6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnq+MZ7S9Bkb64nSbPFmYMkqWM4SJI6hoMkqWM4SJI6XpCeA14wlrTUGQ4rxK1/v4OzR4TWtotOXoDRSFrsPK0kSeoYDpKkjuEgSep4zUEjTXZR3WsU0srgzEGS1DEcJEkdw0GS1DEcJEkdw0GS1JkyHJJcmuSBJF8dantikuuS3Nm+H9bak+TiJFuTfCXJC4e2Oav1vzPJWUPtP53k1rbNxUky20VKkqZnnJnDZcCGPdrOBz5TVWuBz7RlgJcDa9vXRuADMAgT4ALgeOBFwAW7A6X12Ti03Z7HkiTNsylf51BVn0uyZo/mU4H17fHlwATw5tZ+RVUVcEOSQ5M8pfW9rqoeBEhyHbAhyQRwSFV9obVfAbwS+PS+FDWVyd5nSJI0MNMXwa2uqvsBqur+JE9u7UcC9w71297a9ta+fUT7SEk2MphlsHr1aiYmJmY2+MfBecftmtG2S9VkNU/2M5zs5zPTn/lC2Llz55Ia775aafWCNc+l2X6F9KjrBTWD9pGqahOwCWDdunW1fv36GQwR/uuHr+Y9t66sF4efd9yukTVvO2P9yP6Tzawm678YTUxMMNPfkaVopdUL1jyXZnq30jfa6SLa9wda+3bg6KF+RwH3TdF+1Ih2SdICmmk4bAZ233F0FnD1UPuZ7a6lE4Ad7fTTt
cBLkxzWLkS/FLi2rft2khPaXUpnDu1LkrRApjy3kuQjDC4oH5FkO4O7ji4CrkpyDnAP8KrW/RrgJGAr8F3gtQBV9WCSdwA3tX5v331xGngdgzuiHsfgQvScXoyWJE1tnLuVXj3JqhNH9C3g3En2cylw6Yj2m4HnTjUOSdL88RXSkqSO4SBJ6hgOkqSO4SBJ6qysV4Jpn03340P9uFFpaXLmIEnqOHNY4Sb7y17SyubMQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3feE+zwjfwk5YXw0ELws95kBY3TytJkjqGgySpYzhIkjqGgySp4wVpLQlewJbmlzMHSVLHcJAkdQwHSVLHcJAkdbwgrUVlPt6Gw4vb0tScOUiSOs4ctKTtbaZx2YaD5nEk0vLizEGS1HHmoGXr1r/fwdlzeA3Daxdazpw5SJI6iyYckmxIckeSrUnOX+jxSNJKtihOKyXZD3gf8C+B7cBNSTZX1W0LOzKtJAv1aXaentJitCjCAXgRsLWq7gZIciVwKmA4aMmZrZCZ7n7OO27XyGssMwkZA0upqoUeA0lOAzZU1a+15dcAx1fV6/fotxHY2BZ/Crhjhoc8AvjmDLddqqx5+Vtp9YI1T9cxVfWkcToulplDRrR1qVVVm4BN+3yw5OaqWrev+1lKrHn5W2n1gjXPpcVyQXo7cPTQ8lHAfQs0Fkla8RZLONwErE3y9CT7A6cDmxd4TJK0Yi2K00pVtSvJ64Frgf2AS6tqyxwecp9PTS1B1rz8rbR6wZrnzKK4IC1JWlwWy2klSdIiYjhIkjrLOhymekuOJAck+Whbf2OSNfM/ytkzRr2/meS2JF9J8pkkxyzEOGfTuG+7kuS0JJVkyd/2OE7NSf5V+7fekuRP5nuMs22M3+2nJflski+13++TFmKcsyXJpUkeSPLVSdYnycXt5/GVJC+c9UFU1bL8YnBh+y7gGcD+wJeBY/fo82+BD7bHpwMfXehxz3G9vwT8RHv8uqVc77g1t36PBz4H3ACsW+hxz8O/81rgS8BhbfnJCz3ueah5E/C69vhYYNtCj3sfa/4F4IXAVydZfxLwaQavETsBuHG2x7CcZw6PviVHVT0C7H5LjmGnApe3xx8HTkwy6gV5S8GU9VbVZ6vqu23xBgavJ1nKxvk3BngH8LvA9+ZzcHNknJp/HXhfVT0EUFUPzPMYZ9s4NRdwSHv8BJb466Sq6nPAg3vpcipwRQ3cABya5CmzOYblHA5HAvcOLW9vbSP7VNUuYAdw+LyMbvaNU++wcxj85bGUTVlzkhcAR1fVJ+dzYHNonH/nZwHPSvL5JDck2TBvo5sb49T8NuBXk2wHrgHeMD9DWzDT/f8+bYvidQ5zZJy35BjrbTuWiLFrSfKrwDrgF+d0RHNvrzUneQzwXuDs+RrQPBjn33kVg1NL6xnMDv93kudW1cNzPLa5Mk7NrwYuq6r3JPkZ4EOt5h/N/fAWxJw/dy3nmcM4b8nxaJ8kqxhMR/c2lVvMxnoLkiQvAd4KnFJV35+nsc2VqWp+PPBcYCLJNgbnZjcv8YvS4/5eX11VP6iqrzF4g8q18zS+uTBOzecAVwFU1ReAAxm8Qd1yNedvObScw2Gct+TYDJzVHp8GXF/tas8SNGW97RTLf2MQDEv9PDRMUXNV7aiqI6pqTVWtYXCd5ZSqunlhhjsrxvm9/nMGNx+Q5AgGp5nuntdRzq5xar4HOBEgybMZhMM/zOso59dm4Mx219IJwI6qun82D7BsTyvVJG/JkeTtwM1VtRm4hMH0cyuDGcPpCzfifTNmve8GDgY+1q6731NVpyzYoPfRmDUvK2PWfC3w0iS3AT8E/mNVfWvhRr1vxqz5POAPk/wHBqdXzl7Cf+iR5CMMTgse0a6jXAA8FqCqPsjguspJwFbgu8BrZ30MS/jnJ0maI8v5tJIkaYYMB0lSx3CQJHUMB0lSx3CQJ
HUMB0lSx3CQJHX+HzEDS2B1ubMgAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# 以 AMT_CREDIT 為例\n", "app_train['AMT_CREDIT'].hist(bins = 50)\n", "plt.title(\"Original\")\n", "plt.show()\n", "value = app_train['AMT_CREDIT'].values\n", "\n", "app_train['AMT_CREDIT_Norm1'] = ( value - np.mean(value) ) / ( np.std(value) )\n", "app_train['AMT_CREDIT_Norm1'].hist(bins = 50)\n", "plt.title(\"Normalized with Z-transform\")\n", "plt.show()\n", "\n", "app_train['AMT_CREDIT_Norm2'] = ( value - min(value) ) / ( max(value) - min(value) )\n", "app_train['AMT_CREDIT_Norm2'].hist(bins = 50)\n", "plt.title(\"Normalized to 0 ~ 1\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 作業 7\n", "## 處理 outliers\n", "* 新增欄位註記\n", "* outliers 或 NA 填補\n", " 1. 平均數 (mean)\n", " 2. 中位數 (median, or Q50)\n", " 3. 最大/最小值 (max/min, Q100, Q0)\n", " 4. 分位數 (quantile)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Import 需要的套件\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "%matplotlib inline\n", "\n", "# 設定 data_path\n", "dir_data = './data/'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Path of read in data: ./data/application_train.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SK_ID_CURRTARGETNAME_CONTRACT_TYPECODE_GENDERFLAG_OWN_CARFLAG_OWN_REALTYCNT_CHILDRENAMT_INCOME_TOTALAMT_CREDITAMT_ANNUITY...FLAG_DOCUMENT_18FLAG_DOCUMENT_19FLAG_DOCUMENT_20FLAG_DOCUMENT_21AMT_REQ_CREDIT_BUREAU_HOURAMT_REQ_CREDIT_BUREAU_DAYAMT_REQ_CREDIT_BUREAU_WEEKAMT_REQ_CREDIT_BUREAU_MONAMT_REQ_CREDIT_BUREAU_QRTAMT_REQ_CREDIT_BUREAU_YEAR
01000021Cash loansMNY0202500.0406597.524700.5...00000.00.00.00.00.01.0
11000030Cash loansFNN0270000.01293502.535698.5...00000.00.00.00.00.00.0
21000040Revolving loansMYY067500.0135000.06750.0...00000.00.00.00.00.00.0
31000060Cash loansFNY0135000.0312682.529686.5...0000NaNNaNNaNNaNNaNNaN
41000070Cash loansMNY0121500.0513000.021865.5...00000.00.00.00.00.00.0
\n", "

5 rows × 122 columns

\n", "
" ], "text/plain": [ " SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR \\\n", "0 100002 1 Cash loans M N \n", "1 100003 0 Cash loans F N \n", "2 100004 0 Revolving loans M Y \n", "3 100006 0 Cash loans F N \n", "4 100007 0 Cash loans M N \n", "\n", " FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY \\\n", "0 Y 0 202500.0 406597.5 24700.5 \n", "1 N 0 270000.0 1293502.5 35698.5 \n", "2 Y 0 67500.0 135000.0 6750.0 \n", "3 Y 0 135000.0 312682.5 29686.5 \n", "4 Y 0 121500.0 513000.0 21865.5 \n", "\n", " ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 \\\n", "0 ... 0 0 0 0 \n", "1 ... 0 0 0 0 \n", "2 ... 0 0 0 0 \n", "3 ... 0 0 0 0 \n", "4 ... 0 0 0 0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY \\\n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON \\\n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", " AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR \n", "0 0.0 1.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 NaN NaN \n", "4 0.0 0.0 \n", "\n", "[5 rows x 122 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f_app = os.path.join(dir_data, 'application_train.csv')\n", "print('Path of read in data: %s' % (f_app))\n", "app_train = pd.read_csv(f_app)\n", "app_train.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# It's your turn\n", "### 1. 列出 AMT_ANNUITY 的 q0 - q100\n", "### 2.1 將 AMT_ANNUITY 中的 NAs 暫時以中位數填補\n", "### 2.2 將 AMT_ANNUITY 的數值標準化至 -1 ~ 1 間\n", "### 3. 
將 AMT_GOODS_PRICE 的 NAs 以眾數填補" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1615.5, 6182.910000000001, 6750.0, 7875.0, 8703.0, 9000.0, 9000.0, 9553.5, 10125.0, 10503.0, 11074.5, 11430.0, 11970.0, 12375.0, 12838.5, 13302.0, 13500.0, 13500.0, 13896.0, 14350.5, 14701.5, 15124.5, 15583.5, 15970.5, 16209.0, 16524.0, 16852.5, 17109.0, 17487.0, 17806.5, 18189.0, 18643.5, 19102.5, 19417.5, 19836.0, 20151.0, 20421.0, 20853.0, 21186.0, 21609.0, 21865.5, 22018.5, 22342.5, 22527.0, 22972.5, 23346.0, 23719.5, 23931.0, 24259.5, 24583.589999999953, 24903.0, 25240.5, 25537.5, 25960.5, 26217.0, 26316.0, 26640.0, 26860.5, 27189.0, 27558.0, 28062.0, 28480.5, 28917.0, 29340.0, 29830.5, 30078.0, 30483.0, 30897.0, 31275.0, 31630.5, 32004.0, 32458.5, 32895.0, 33376.5, 33984.0, 34596.0, 35345.16000000005, 35806.5, 36328.5, 36747.0, 37516.5, 37948.5, 38556.0, 39456.0, 40135.5, 40806.0, 41845.5, 42790.5, 43735.5, 44991.0, 45954.0, 47254.5, 48465.0, 49878.0, 51745.5, 53325.0, 55624.5, 58482.0, 62964.0, 70006.5, 258025.5]\n", " q value\n", "0 0 1615.50\n", "1 1 6182.91\n", "2 2 6750.00\n", "3 3 7875.00\n", "4 4 8703.00\n", "5 5 9000.00\n", "6 6 9000.00\n", "7 7 9553.50\n", "8 8 10125.00\n", "9 9 10503.00\n", "10 10 11074.50\n", "11 11 11430.00\n", "12 12 11970.00\n", "13 13 12375.00\n", "14 14 12838.50\n", "15 15 13302.00\n", "16 16 13500.00\n", "17 17 13500.00\n", "18 18 13896.00\n", "19 19 14350.50\n", "20 20 14701.50\n", "21 21 15124.50\n", "22 22 15583.50\n", "23 23 15970.50\n", "24 24 16209.00\n", "25 25 16524.00\n", "26 26 16852.50\n", "27 27 17109.00\n", "28 28 17487.00\n", "29 29 17806.50\n", ".. ... 
...\n", "71 71 32458.50\n", "72 72 32895.00\n", "73 73 33376.50\n", "74 74 33984.00\n", "75 75 34596.00\n", "76 76 35345.16\n", "77 77 35806.50\n", "78 78 36328.50\n", "79 79 36747.00\n", "80 80 37516.50\n", "81 81 37948.50\n", "82 82 38556.00\n", "83 83 39456.00\n", "84 84 40135.50\n", "85 85 40806.00\n", "86 86 41845.50\n", "87 87 42790.50\n", "88 88 43735.50\n", "89 89 44991.00\n", "90 90 45954.00\n", "91 91 47254.50\n", "92 92 48465.00\n", "93 93 49878.00\n", "94 94 51745.50\n", "95 95 53325.00\n", "96 96 55624.50\n", "97 97 58482.00\n", "98 98 62964.00\n", "99 99 70006.50\n", "100 100 258025.50\n", "\n", "[101 rows x 2 columns]\n" ] } ], "source": [ "\"\"\"\n", "YOUR CODE HERE\n", "\"\"\"\n", "# Percent ranks 0..100; kept under its own name so q_all only ever refers to the result table\n", "percent_ranks = list(range(101))\n", "# Drop NAs once up front: np.percentile does not skip NaN values\n", "annuity_valid = app_train['AMT_ANNUITY'].dropna()\n", "# A single vectorized call replaces 101 separate filter-and-percentile passes;\n", "# .tolist() keeps quantile_q_all a plain list of floats\n", "quantile_q_all = np.percentile(annuity_valid, percent_ranks).tolist()\n", "print(quantile_q_all)\n", "\n", "# 1: compute q0 - q100 of AMT_ANNUITY\n", "q_all = pd.DataFrame({'q': percent_ranks,\n", " 'value': quantile_q_all})\n", "\n", "print(q_all)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
qvalue
001615.50
116182.91
226750.00
337875.00
448703.00
559000.00
669000.00
779553.50
8810125.00
9910503.00
101011074.50
111111430.00
121211970.00
131312375.00
141412838.50
151513302.00
161613500.00
171713500.00
181813896.00
191914350.50
202014701.50
212115124.50
222215583.50
232315970.50
242416209.00
252516524.00
262616852.50
272717109.00
282817487.00
292917806.50
.........
717132458.50
727232895.00
737333376.50
747433984.00
757534596.00
767635345.16
777735806.50
787836328.50
797936747.00
808037516.50
818137948.50
828238556.00
838339456.00
848440135.50
858540806.00
868641845.50
878742790.50
888843735.50
898944991.00
909045954.00
919147254.50
929248465.00
939349878.00
949451745.50
959553325.00
969655624.50
979758482.00
989862964.00
999970006.50
100100258025.50
\n", "

101 rows × 2 columns

\n", "
" ], "text/plain": [ " q value\n", "0 0 1615.50\n", "1 1 6182.91\n", "2 2 6750.00\n", "3 3 7875.00\n", "4 4 8703.00\n", "5 5 9000.00\n", "6 6 9000.00\n", "7 7 9553.50\n", "8 8 10125.00\n", "9 9 10503.00\n", "10 10 11074.50\n", "11 11 11430.00\n", "12 12 11970.00\n", "13 13 12375.00\n", "14 14 12838.50\n", "15 15 13302.00\n", "16 16 13500.00\n", "17 17 13500.00\n", "18 18 13896.00\n", "19 19 14350.50\n", "20 20 14701.50\n", "21 21 15124.50\n", "22 22 15583.50\n", "23 23 15970.50\n", "24 24 16209.00\n", "25 25 16524.00\n", "26 26 16852.50\n", "27 27 17109.00\n", "28 28 17487.00\n", "29 29 17806.50\n", ".. ... ...\n", "71 71 32458.50\n", "72 72 32895.00\n", "73 73 33376.50\n", "74 74 33984.00\n", "75 75 34596.00\n", "76 76 35345.16\n", "77 77 35806.50\n", "78 78 36328.50\n", "79 79 36747.00\n", "80 80 37516.50\n", "81 81 37948.50\n", "82 82 38556.00\n", "83 83 39456.00\n", "84 84 40135.50\n", "85 85 40806.00\n", "86 86 41845.50\n", "87 87 42790.50\n", "88 88 43735.50\n", "89 89 44991.00\n", "90 90 45954.00\n", "91 91 47254.50\n", "92 92 48465.00\n", "93 93 49878.00\n", "94 94 51745.50\n", "95 95 53325.00\n", "96 96 55624.50\n", "97 97 58482.00\n", "98 98 62964.00\n", "99 99 70006.50\n", "100 100 258025.50\n", "\n", "[101 rows x 2 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q_all" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "也可以使用 [DataFrame.quantile](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.quantile.html#pandas.DataFrame.quantile) 的方法,得到的答案是一樣的。" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Series.quantile takes fractions in [0, 1] and skips NaN values, so no explicit NA filtering is needed\n", "quantile_series = app_train['AMT_ANNUITY'].quantile([i/100 for i in range(101)])\n", "# Build the table from the Series.quantile result itself (not from the earlier np.percentile list),\n", "# so the table shown next really comes from this method\n", "q_all = pd.DataFrame({'q': list(range(101)),\n", " 'value': quantile_series.values})" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
qvalue
001615.50
116182.91
226750.00
337875.00
448703.00
559000.00
669000.00
779553.50
8810125.00
9910503.00
101011074.50
111111430.00
121211970.00
131312375.00
141412838.50
151513302.00
161613500.00
171713500.00
181813896.00
191914350.50
202014701.50
212115124.50
222215583.50
232315970.50
242416209.00
252516524.00
262616852.50
272717109.00
282817487.00
292917806.50
.........
717132458.50
727232895.00
737333376.50
747433984.00
757534596.00
767635345.16
777735806.50
787836328.50
797936747.00
808037516.50
818137948.50
828238556.00
838339456.00
848440135.50
858540806.00
868641845.50
878742790.50
888843735.50
898944991.00
909045954.00
919147254.50
929248465.00
939349878.00
949451745.50
959553325.00
969655624.50
979758482.00
989862964.00
999970006.50
100100258025.50
\n", "

101 rows × 2 columns

\n", "
" ], "text/plain": [ " q value\n", "0 0 1615.50\n", "1 1 6182.91\n", "2 2 6750.00\n", "3 3 7875.00\n", "4 4 8703.00\n", "5 5 9000.00\n", "6 6 9000.00\n", "7 7 9553.50\n", "8 8 10125.00\n", "9 9 10503.00\n", "10 10 11074.50\n", "11 11 11430.00\n", "12 12 11970.00\n", "13 13 12375.00\n", "14 14 12838.50\n", "15 15 13302.00\n", "16 16 13500.00\n", "17 17 13500.00\n", "18 18 13896.00\n", "19 19 14350.50\n", "20 20 14701.50\n", "21 21 15124.50\n", "22 22 15583.50\n", "23 23 15970.50\n", "24 24 16209.00\n", "25 25 16524.00\n", "26 26 16852.50\n", "27 27 17109.00\n", "28 28 17487.00\n", "29 29 17806.50\n", ".. ... ...\n", "71 71 32458.50\n", "72 72 32895.00\n", "73 73 33376.50\n", "74 74 33984.00\n", "75 75 34596.00\n", "76 76 35345.16\n", "77 77 35806.50\n", "78 78 36328.50\n", "79 79 36747.00\n", "80 80 37516.50\n", "81 81 37948.50\n", "82 82 38556.00\n", "83 83 39456.00\n", "84 84 40135.50\n", "85 85 40806.00\n", "86 86 41845.50\n", "87 87 42790.50\n", "88 88 43735.50\n", "89 89 44991.00\n", "90 90 45954.00\n", "91 91 47254.50\n", "92 92 48465.00\n", "93 93 49878.00\n", "94 94 51745.50\n", "95 95 53325.00\n", "96 96 55624.50\n", "97 97 58482.00\n", "98 98 62964.00\n", "99 99 70006.50\n", "100 100 258025.50\n", "\n", "[101 rows x 2 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q_all" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Before replace NAs, numbers of row that AMT_ANNUITY is NAs: 12\n", "After replace NAs, numbers of row that AMT_ANNUITY is NAs: 0\n" ] } ], "source": [ "# 2.1 Fill NAs with q50 (the median)\n", "# .isnull().sum() counts the missing rows vectorized; the builtin sum() walks the Series in Python\n", "print(\"Before replace NAs, numbers of row that AMT_ANNUITY is NAs: %i\" % app_train['AMT_ANNUITY'].isnull().sum())\n", "\n", "\"\"\"\n", "Your Code Here\n", "\"\"\"\n", "# Series.quantile skips NaN values, so q_50 is the median of the observed annuities\n", "q_50 = app_train['AMT_ANNUITY'].quantile(50/100)\n", "app_train.loc[app_train['AMT_ANNUITY'].isnull(),'AMT_ANNUITY'] = q_50\n", "\n", "print(\"After replace 
NAs, numbers of row that AMT_ANNUITY is NAs: %i\" % app_train['AMT_ANNUITY'].isnull().sum())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "將NAN填補過後, isnull() 回報總共的 NAN 數量變 0 " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Hints: Normalize function (to -1 ~ 1)\n", "$ y = 2*(\\frac{x - min(x)}{max(x) - min(x)} - 0.5) $" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "== Original data range ==\n", "count 307511.000000\n", "mean 27108.487841\n", "std 14493.461065\n", "min 1615.500000\n", "25% 16524.000000\n", "50% 24903.000000\n", "75% 34596.000000\n", "max 258025.500000\n", "Name: AMT_ANNUITY, dtype: float64\n", "== Normalized data range ==\n" ] }, { "data": { "text/plain": [ "count 307511.000000\n", "mean -0.801154\n", "std 0.113049\n", "min -1.000000\n", "25% -0.883714\n", "50% -0.818357\n", "75% -0.742752\n", "max 1.000000\n", "Name: AMT_ANNUITY_NORMALIZED, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 2.2 Normalize values to -1 to 1\n", "print(\"== Original data range ==\")\n", "print(app_train['AMT_ANNUITY'].describe())\n", "\n", "def normalize_value(x):\n", "    \"\"\"Linearly rescale x so that x.min() maps to -1 and x.max() maps to 1.\"\"\"\n", "    min_x = x.min()\n", "    max_x = x.max()\n", "    # (x - min) / (max - min) lies in 0 ~ 1; subtracting 0.5 and doubling moves it to -1 ~ 1\n", "    y = 2*((x-min_x)/(max_x-min_x) - 0.5)\n", "    return y\n", "\n", "app_train['AMT_ANNUITY_NORMALIZED'] = normalize_value(app_train['AMT_ANNUITY'])\n", "\n", "print(\"== Normalized data range ==\")\n", "app_train['AMT_ANNUITY_NORMALIZED'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "也可以用 sklearn 的 MinMaxScaler function 直接套用" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "== Normalized data range ==\n" ] }, { "data": { "text/plain": [ "count 307511.000000\n", "mean -0.801154\n", "std 0.113049\n", "min -1.000000\n", "25% -0.883714\n", "50% -0.818357\n", "75% 
-0.742752\n", "max 1.000000\n", "Name: AMT_ANNUITY_NORMALIZED, dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "# NOTE: this redefines normalize_value from the previous cell with a scikit-learn based equivalent\n", "def normalize_value(x):\n", "    \"\"\"Rescale the Series x to -1 ~ 1 with MinMaxScaler and return a 1-D ndarray.\"\"\"\n", "    mms = MinMaxScaler(feature_range=(-1,1))\n", "    # fit_transform needs 2-D input and returns shape (n, 1); flatten it so the result is a single column\n", "    return mms.fit_transform(x.to_frame()).ravel()\n", "app_train['AMT_ANNUITY_NORMALIZED'] = normalize_value(app_train['AMT_ANNUITY'])\n", "\n", "print(\"== Normalized data range ==\")\n", "app_train['AMT_ANNUITY_NORMALIZED'].describe()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Before replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: 278\n", "(450000.0, 26022)\n", "After replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: 0\n" ] } ], "source": [ "# 3\n", "print(\"Before replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i\" % app_train['AMT_GOODS_PRICE'].isnull().sum())\n", "\n", "# List the most frequent value\n", "\"\"\"\n", "Your Code Here\n", "\"\"\"\n", "# value_counts() drops NaN and sorts by count in descending order, so its first entry is the mode.\n", "# Counting once here replaces a Python-level loop whose loop variable overwrote the notebook-level `value`\n", "goods_price_counts = app_train['AMT_GOODS_PRICE'].value_counts()\n", "\n", "# (mode, count) as plain Python numbers so the printed tuple does not depend on the numpy repr\n", "value_most = (float(goods_price_counts.index[0]), int(goods_price_counts.iloc[0]))\n", "print(value_most)\n", "\n", "mode_goods_price = list(goods_price_counts.index)\n", "app_train.loc[app_train['AMT_GOODS_PRICE'].isnull(), 'AMT_GOODS_PRICE'] = mode_goods_price[0]\n", "\n", "print(\"After replace NAs, numbers of row that AMT_GOODS_PRICE is NAs: %i\" % app_train['AMT_GOODS_PRICE'].isnull().sum())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", 
"version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 1 }