{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# 파이썬 2와 파이썬 3 지원\n", "from __future__ import division, print_function, unicode_literals\n", "\n", "# 공통\n", "import numpy as np\n", "import os\n", "\n", "# 일관된 출력을 위해 유사난수 초기화\n", "np.random.seed(42)\n", "\n", "# 맷플롯립 설정\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "plt.rcParams['axes.labelsize'] = 14\n", "plt.rcParams['xtick.labelsize'] = 12\n", "plt.rcParams['ytick.labelsize'] = 12\n", "\n", "# 한글출력\n", "matplotlib.rc('font', family='NanumBarunGothic')\n", "plt.rcParams['axes.unicode_minus'] = False\n", "\n", "# 그림을 저장할 폴더\n", "PROJECT_ROOT_DIR = \"c:\\\\git\\\\hands_on_ml_link\"\n", "CHAPTER_ID = \"end_to_end_project\"\n", "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n", "\n", "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n", " path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n", " if tight_layout:\n", " plt.tight_layout()\n", " plt.savefig(path, format=fig_extension, dpi=resolution)\n", "\n", "import os\n", "import tarfile\n", "from six.moves import urllib\n", "\n", "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n", "HOUSING_PATH = os.path.join(\"c:\\\\git\\\\hands_on_ml_link\",\"datasets\", \"housing\")\n", "HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n", "\n", "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", " if not os.path.isdir(housing_path):\n", " os.makedirs(housing_path)\n", " tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", " urllib.request.urlretrieve(housing_url, tgz_path)\n", " housing_tgz = tarfile.open(tgz_path)\n", " housing_tgz.extractall(path=housing_path)\n", " housing_tgz.close()\n", " \n", "import pandas as pd\n", "\n", "def load_housing_data(housing_path=HOUSING_PATH):\n", " csv_path = os.path.join(housing_path, 
\"housing.csv\")\n", " return pd.read_csv(csv_path)\n", "\n", "housing = load_housing_data()\n", "\n", "np.random.seed(42)\n", "\n", "import numpy as np\n", "\n", "# 예시를 위해서 만든 것입니다. 사이킷런에는 train_test_split() 함수가 있습니다.\n", "def split_train_test(data, test_ratio):\n", " shuffled_indices = np.random.permutation(len(data))\n", " test_set_size = int(len(data) * test_ratio)\n", " test_indices = shuffled_indices[:test_set_size]\n", " train_indices = shuffled_indices[test_set_size:]\n", " return data.iloc[train_indices], data.iloc[test_indices]\n", "\n", "train_set, test_set = split_train_test(housing, 0.2)\n", "\n", "\n", "from zlib import crc32\n", "\n", "def test_set_check(identifier, test_ratio):\n", " return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32\n", "\n", "def split_train_test_by_id(data, test_ratio, id_column):\n", " ids = data[id_column]\n", " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))\n", " return data.loc[~in_test_set], data.loc[in_test_set]\n", "\n", "import hashlib\n", "\n", "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n", "\n", "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n", "\n", "housing_with_id = housing.reset_index() # `index` 열이 추가된 데이터프레임이 반환됩니다.\n", "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n", "\n", "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n", "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", "\n", "# 소득 카테고리 개수를 제한하기 위해 1.5로 나눕니다.\n", "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n", "# 5 이상은 5로 레이블합니다.\n", 
"housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)\n", "\n", "from sklearn.model_selection import StratifiedShuffleSplit\n", "\n", "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n", " strat_train_set = housing.loc[train_index]\n", " strat_test_set = housing.loc[test_index]\n", "\n", "def income_cat_proportions(data):\n", " return data[\"income_cat\"].value_counts() / len(data)\n", "\n", "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", "\n", "compare_props = pd.DataFrame({\n", " \"Overall\": income_cat_proportions(housing),\n", " \"Stratified\": income_cat_proportions(strat_test_set),\n", " \"Random\": income_cat_proportions(test_set),\n", "}).sort_index()\n", "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n", "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100\n", "\n", "for set_ in (strat_train_set, strat_test_set):\n", " set_.drop(\"income_cat\", axis=1, inplace=True)\n", "\n", "housing = strat_train_set.copy()\n", "\n", "housing = strat_train_set.drop(\"median_house_value\", axis=1) # 훈련 세트를 위해 레이블 삭제\n", "housing_labels = strat_train_set[\"median_house_value\"].copy()\n", "\n", "sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()\n", "\n", "from sklearn.impute import SimpleImputer\n", "\n", "imputer = SimpleImputer(strategy=\"median\")\n", "\n", "housing_num = housing.drop('ocean_proximity', axis=1)\\\n", " \n", "imputer.fit(housing_num)\n", "housing_num.median().values\n", "X = imputer.transform(housing_num)\n", "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", " index = list(housing.index.values))\n", "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", "\n", "housing_cat = housing['ocean_proximity']\n", "housing_cat_encoded, 
housing_categories = housing_cat.factorize()\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", "encoder = OneHotEncoder(categories='auto')\n", "housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.utils import check_array\n", "from sklearn.preprocessing import LabelEncoder\n", "from scipy import sparse\n", "\n", "class CategoricalEncoder(BaseEstimator, TransformerMixin):\n", " def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,\n", " handle_unknown='error'):\n", " self.encoding = encoding\n", " self.categories = categories\n", " self.dtype = dtype\n", " self.handle_unknown = handle_unknown\n", "\n", " def fit(self, X, y=None):\n", " \"\"\"Fit the CategoricalEncoder to X.\n", " Parameters\n", " ----------\n", " X : array-like, shape [n_samples, n_feature]\n", " The data to determine the categories of each feature.\n", " Returns\n", " -------\n", " self\n", " \"\"\"\n", "\n", " if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:\n", " template = (\"encoding should be either 'onehot', 'onehot-dense' \"\n", " \"or 'ordinal', got %s\")\n", " raise ValueError(template % self.handle_unknown)\n", "\n", " if self.handle_unknown not in ['error', 'ignore']:\n", " template = (\"handle_unknown should be either 'error' or \"\n", " \"'ignore', got %s\")\n", " raise ValueError(template % self.handle_unknown)\n", "\n", " if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':\n", " raise ValueError(\"handle_unknown='ignore' is not supported for\"\n", " \" encoding='ordinal'\")\n", "\n", " X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)\n", " n_samples, n_features = X.shape\n", "\n", " self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]\n", "\n", " for i in range(n_features):\n", " le = self._label_encoders_[i]\n", " Xi = X[:, i]\n", " if self.categories == 'auto':\n", " le.fit(Xi)\n", " 
else:\n", " valid_mask = np.in1d(Xi, self.categories[i])\n", " if not np.all(valid_mask):\n", " if self.handle_unknown == 'error':\n", " diff = np.unique(Xi[~valid_mask])\n", " msg = (\"Found unknown categories {0} in column {1}\"\n", " \" during fit\".format(diff, i))\n", " raise ValueError(msg)\n", " le.classes_ = np.array(np.sort(self.categories[i]))\n", "\n", " self.categories_ = [le.classes_ for le in self._label_encoders_]\n", "\n", " return self\n", "\n", " def transform(self, X):\n", " \"\"\"Transform X using one-hot encoding.\n", " Parameters\n", " ----------\n", " X : array-like, shape [n_samples, n_features]\n", " The data to encode.\n", " Returns\n", " -------\n", " X_out : sparse matrix or a 2-d array\n", " Transformed input.\n", " \"\"\"\n", " X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)\n", " n_samples, n_features = X.shape\n", " X_int = np.zeros_like(X, dtype=np.int)\n", " X_mask = np.ones_like(X, dtype=np.bool)\n", "\n", " for i in range(n_features):\n", " valid_mask = np.in1d(X[:, i], self.categories_[i])\n", "\n", " if not np.all(valid_mask):\n", " if self.handle_unknown == 'error':\n", " diff = np.unique(X[~valid_mask, i])\n", " msg = (\"Found unknown categories {0} in column {1}\"\n", " \" during transform\".format(diff, i))\n", " raise ValueError(msg)\n", " else:\n", " # Set the problematic rows to an acceptable value and\n", " # continue `The rows are marked `X_mask` and will be\n", " # removed later.\n", " X_mask[:, i] = valid_mask\n", " X[:, i][~valid_mask] = self.categories_[i][0]\n", " X_int[:, i] = self._label_encoders_[i].transform(X[:, i])\n", "\n", " if self.encoding == 'ordinal':\n", " return X_int.astype(self.dtype, copy=False)\n", "\n", " mask = X_mask.ravel()\n", " n_values = [cats.shape[0] for cats in self.categories_]\n", " n_values = np.array([0] + n_values)\n", " indices = np.cumsum(n_values)\n", "\n", " column_indices = (X_int + indices[:-1]).ravel()[mask]\n", " row_indices = 
np.repeat(np.arange(n_samples, dtype=np.int32),\n", " n_features)[mask]\n", " data = np.ones(n_samples * n_features)[mask]\n", "\n", " out = sparse.csc_matrix((data, (row_indices, column_indices)),\n", " shape=(n_samples, indices[-1]),\n", " dtype=self.dtype).tocsr()\n", " if self.encoding == 'onehot-dense':\n", " return out.toarray()\n", " else:\n", " return out\n", "\n", "cat_encoder = CategoricalEncoder()\n", "housing_cat_reshaped = housing_cat.values.reshape(-1, 1)\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", "cat_encoder = OneHotEncoder(categories='auto')\n", "housing_cat_reshaped = housing_cat.values.reshape(-1, 1)\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)\n", "cat_encoder = OneHotEncoder(categories='auto', sparse=False)\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)\n", "cat_encoder.categories_\n", "housing_cat = housing[['ocean_proximity']]\n", "from sklearn.preprocessing import OrdinalEncoder\n", "ordinal_encoder = OrdinalEncoder()\n", "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", "cat_encoder = OneHotEncoder(categories='auto')\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", "cat_encoder = OneHotEncoder(categories='auto', sparse=False)\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", "# 컬럼 인덱스\n", "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n", "\n", "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n", " def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n", " self.add_bedrooms_per_room = add_bedrooms_per_room\n", " def fit(self, X, y=None):\n", " return self # nothing else to do\n", " def transform(self, X, y=None):\n", " rooms_per_household = X[:, rooms_ix] / X[:, 
household_ix]\n", " population_per_household = X[:, population_ix] / X[:, household_ix]\n", " if self.add_bedrooms_per_room:\n", " bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n", " return np.c_[X, rooms_per_household, population_per_household,\n", " bedrooms_per_room]\n", " else:\n", " return np.c_[X, rooms_per_household, population_per_household]\n", "\n", "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n", "housing_extra_attribs = attr_adder.transform(housing.values)\n", "housing_extra_attribs = pd.DataFrame(\n", " housing_extra_attribs, \n", " columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "num_pipeline = Pipeline([\n", " ('imputer', SimpleImputer(strategy=\"median\")),\n", " ('attribs_adder', CombinedAttributesAdder()),\n", " ('std_scaler', StandardScaler()),\n", " ])\n", "\n", "housing_num_tr = num_pipeline.fit_transform(housing_num)\n", "from sklearn.compose import ColumnTransformer\n", "\n", "num_attribs = list(housing_num)\n", "cat_attribs = [\"ocean_proximity\"]\n", "\n", "full_pipeline = ColumnTransformer([\n", " (\"num\", num_pipeline, num_attribs),\n", " (\"cat\", OneHotEncoder(categories='auto'), cat_attribs),\n", " ])\n", "\n", "housing_prepared = full_pipeline.fit_transform(housing)\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", "# 사이킷런이 DataFrame을 바로 사용하지 못하므로\n", "# 수치형이나 범주형 컬럼을 선택하는 클래스를 만듭니다.\n", "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", " def __init__(self, attribute_names):\n", " self.attribute_names = attribute_names\n", " def fit(self, X, y=None):\n", " return self\n", " def transform(self, X):\n", " return X[self.attribute_names].values\n", "num_attribs = list(housing_num)\n", "cat_attribs = [\"ocean_proximity\"]\n", "\n", "num_pipeline = Pipeline([\n", " ('selector', DataFrameSelector(num_attribs)),\n", " ('imputer', 
SimpleImputer(strategy=\"median\")),\n", " ('attribs_adder', CombinedAttributesAdder()),\n", " ('std_scaler', StandardScaler()),\n", " ])\n", "\n", "cat_pipeline = Pipeline([\n", " ('selector', DataFrameSelector(cat_attribs)),\n", " ('cat_encoder', CategoricalEncoder(encoding=\"onehot-dense\")),\n", " ])\n", "full_pipeline = ColumnTransformer([\n", " (\"num_pipeline\", num_pipeline, num_attribs),\n", " (\"cat_encoder\", OneHotEncoder(categories='auto'), cat_attribs),\n", " ])\n", "housing_prepared = full_pipeline.fit_transform(housing)\n", "\n", "\n", "df = pd.DataFrame(np.arange(10, 22).reshape(3, 4),\n", " index=[\"a\", \"b\", \"c\"],\n", " columns=[\"A\", \"B\", \"C\", \"D\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Chapter 2. 머신러닝 프로젝트 처음부터 끝까지\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "## 2.6 모델 선택과 훈련" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.6.1 훈련 세트에서 훈련하고 평가하기" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 선형 회귀 모델을 훈련" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", " normalize=False)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", "lin_reg = LinearRegression() # 선형 회귀 모델\n", "lin_reg.fit(housing_prepared, housing_labels) # 훈련, 매개변수(Train data, Target values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- 훈련 세트에 있는 몇 개 샘플에 대해 적용" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | longitude | \n", "latitude | \n", "housing_median_age | \n", "total_rooms | \n", "total_bedrooms | \n", "population | \n", "households | \n", "median_income | \n", "ocean_proximity | \n", "
---|---|---|---|---|---|---|---|---|---|
17606 | \n", "-121.89 | \n", "37.29 | \n", "38.0 | \n", "1568.0 | \n", "351.0 | \n", "710.0 | \n", "339.0 | \n", "2.7042 | \n", "<1H OCEAN | \n", "
18632 | \n", "-121.93 | \n", "37.05 | \n", "14.0 | \n", "679.0 | \n", "108.0 | \n", "306.0 | \n", "113.0 | \n", "6.4214 | \n", "<1H OCEAN | \n", "
14650 | \n", "-117.20 | \n", "32.77 | \n", "31.0 | \n", "1952.0 | \n", "471.0 | \n", "936.0 | \n", "462.0 | \n", "2.8621 | \n", "NEAR OCEAN | \n", "
3230 | \n", "-119.61 | \n", "36.31 | \n", "25.0 | \n", "1847.0 | \n", "371.0 | \n", "1460.0 | \n", "353.0 | \n", "1.8839 | \n", "INLAND | \n", "
3555 | \n", "-118.59 | \n", "34.23 | \n", "17.0 | \n", "6592.0 | \n", "1525.0 | \n", "4459.0 | \n", "1463.0 | \n", "3.0347 | \n", "<1H OCEAN | \n", "