{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "import numpy as np\n",
 "import pandas as pd\n",
 "\n",
 "from scipy.stats import randint as sp_randint\n",
 "from sklearn import tree\n",
 "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
 "# train_test_split and RandomizedSearchCV are provided by sklearn.model_selection;\n",
 "# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20.\n",
 "from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split\n",
 "from sklearn.preprocessing import LabelEncoder, OneHotEncoder"
 ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
 "data_mat = pd.read_csv(\"student-mat.csv\", delimiter=';')\n",
 "data_por = pd.read_csv(\"student-por.csv\", delimiter=';')\n",
 "\n",
 "# Columns holding string categories; they are one-hot encoded by preprocessing().\n",
 "categorical_features_name = [\n",
 "    \"school\",\n",
 "    \"sex\",\n",
 "    \"address\",\n",
 "    \"famsize\",\n",
 "    \"Pstatus\",\n",
 "    \"Mjob\",\n",
 "    \"Fjob\",\n",
 "    \"reason\",\n",
 "    \"guardian\",\n",
 "    \"schoolsup\",\n",
 "    \"famsup\",\n",
 "    \"paid\",\n",
 "    \"activities\",\n",
 "    \"nursery\",\n",
 "    \"higher\",\n",
 "    \"internet\",\n",
 "    \"romantic\"\n",
 "]\n",
 "\n",
 "choosed_features = [\n",
 "\n",
 "]\n",
 "\n",
 "# Columns to predict; preprocessing() removes them from the feature matrix.\n",
 "target_features = [\n",
 "    \"Dalc\",\n",
 "    \"Walc\"\n",
 "]"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [
 "def preprocessing(data, columns=None):\n",
 "    \"\"\"Build the feature matrix and both targets from a student DataFrame.\n",
 "\n",
 "    Categorical columns (see categorical_features_name) are label-encoded and\n",
 "    then expanded to one-hot indicator columns with OneHotEncoder.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    data : pandas.DataFrame\n",
 "        Must contain the \"Walc\" and \"Dalc\" columns.\n",
 "    columns : iterable of str, optional\n",
 "        Names of the columns to keep as features. Names that are not present\n",
 "        in `data` are ignored. Defaults to every column of `data`.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    X : numpy.ndarray\n",
 "        One-hot encoded categorical columns followed by the remaining\n",
 "        columns cast to float.\n",
 "    y_walc, y_dalc : pandas.Series\n",
 "        The \"Walc\" and \"Dalc\" columns of `data`.\n",
 "    \"\"\"\n",
 "    if columns is None:\n",
 "        # Resolve the default per call: a default of data_mat.columns is frozen at\n",
 "        # definition time and silently drops columns that were added to `data` later.\n",
 "        columns = data.columns\n",
 "\n",
 "    features = data.loc[:, data.columns.isin(columns)]\n",
 "    # The targets must never end up in the feature matrix.\n",
 "    features = features.drop(columns=target_features, errors=\"ignore\")\n",
 "\n",
 "    is_categorical = features.columns.isin(categorical_features_name)\n",
 "    numeric_part = features.loc[:, ~is_categorical].values.astype(float)\n",
 "\n",
 "    if not is_categorical.any():\n",
 "        # Nothing to encode; OneHotEncoder cannot be fitted on zero columns.\n",
 "        X = numeric_part\n",
 "    else:\n",
 "        # Integer-code each column independently, then expand the codes to indicators.\n",
 "        # The encoding is written to a new frame, never back into a slice of `data`.\n",
 "        encoded = features.loc[:, is_categorical].apply(\n",
 "            lambda column: LabelEncoder().fit_transform(column)\n",
 "        )\n",
 "        # toarray() gives a plain ndarray; todense() returns np.matrix, which newer\n",
 "        # scikit-learn releases no longer accept as estimator input.\n",
 "        one_hot = OneHotEncoder().fit_transform(encoded).toarray()\n",
 "        X = np.concatenate([one_hot, numeric_part], axis=1)\n",
 "\n",
 "    return X, data.loc[:, \"Walc\"], data.loc[:, \"Dalc\"]"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "# Feature subset used by the first experiment.\n",
 "# NOTE(review): \"male\" is not created anywhere in this notebook and isin() ignores\n",
 "# unknown names, so it has no effect - confirm whether \"sex\" was intended.\n",
 "l = [\"paid\", \"schoolsup\", \"activities\", \"higher\", \"freetime\", \"goout\", \"male\"]"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [
 "# Random forest для задачи классификации\n",
 "\n",
 "Будем рассматривать задачи классификации по 5 классам. Предсказываемые переменные - \"Dalc\" и \"Walc\".\n",
 "\n",
 "Применим для её решения ансамбль решающих деревьев, используя выбранные выше признаки." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random forest for Walc\n", "Best randomized search score - 0.37\n", "Validation score - 0.33\n", "Random forest for Dalc\n", "Best randomized search score - 0.53\n", "Validation score - 0.58\n" ] } ], "source": [
 "X, y_w, y_d = preprocessing(data_mat, l)\n",
 "# Fixed seed so that the forest, and therefore the reported scores, are reproducible.\n",
 "rf_clf = RandomForestClassifier(class_weight=\"balanced\", random_state=42)\n",
 "# Every key appears exactly once: a repeated \"max_features\" key in a dict literal is\n",
 "# silently overridden by its last occurrence. \"auto\" is left out because it is an\n",
 "# alias of \"sqrt\" for classifiers and was removed in scikit-learn 1.3.\n",
 "rf_param_dist = {\n",
 "    \"n_estimators\": np.arange(10, 100, 10),\n",
 "    \"max_depth\": sp_randint(1, 31),\n",
 "    \"min_samples_leaf\": sp_randint(1, 11),\n",
 "    \"max_features\": [\"sqrt\", \"log2\"],\n",
 "    \"criterion\": [\"gini\", \"entropy\"],\n",
 "}\n",
 "\n",
 "n_iter_search = 250\n",
 "rf_random_search = RandomizedSearchCV(rf_clf, param_distributions=rf_param_dist,\n",
 "                                      n_iter=n_iter_search, random_state=42)\n",
 "\n",
 "# Same protocol for both targets: stratified 70/30 split, randomized search on the\n",
 "# training part, then accuracy of the best estimator on the held-out part.\n",
 "for target_name, target in ((\"Walc\", y_w), (\"Dalc\", y_d)):\n",
 "    print(\"Random forest for %s\" % target_name)\n",
 "    x_train, x_test, y_train, y_test = train_test_split(\n",
 "        X, target, test_size=0.3, random_state=42, stratify=target)\n",
 "    rf_random_search.fit(x_train, y_train)\n",
 "    best_rf = rf_random_search.best_estimator_\n",
 "    rf_val_score = best_rf.score(x_test, y_test)\n",
 "    print(\"Best randomized search score - %s\" % round(rf_random_search.best_score_, 2))\n",
 "    print(\"Validation score - %s\" % round(rf_val_score, 2))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Переход к новым признакам\n", "Постараемся сгенерировать новые признаки на базе старых. 
\n", "Будем искать оптимальные параметры для них." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
 "# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.\n",
 "students = pd.concat([data_mat, data_por], ignore_index=True)\n",
 "\n",
 "sum_grade = students['G1'] + students['G2'] + students['G3']\n",
 "mean_grade = sum_grade / 3\n",
 "\n",
 "# Binary indicators derived from the raw columns, stored as 0/1 integers so that the\n",
 "# feature matrix stays purely numeric.\n",
 "engineered = pd.DataFrame({\n",
 "    'rural': students['address'] == \"R\",\n",
 "    'big_family': students['famsize'] == 'GT3',\n",
 "    'parents_together': students['Pstatus'] == \"T\",\n",
 "    'studies_less': students['studytime'] < 3,\n",
 "    'more_failures': students['failures'] >= 2,\n",
 "    'bad_relationships': students['famrel'] <= 2,\n",
 "    'more_free_time': students['freetime'] > 3,\n",
 "    'goes_out_more': students['goout'] > 4,\n",
 "    'bad_health': students['health'] <= 2,\n",
 "    'high_absences': students['absences'] > (students['absences'].std() * 2),\n",
 "    'mothers_low_edu': students['Medu'] <= 3,\n",
 "    'fathers_low_edu': students['Fedu'] <= 3,\n",
 "    'more_than_18': students['age'] > 18,\n",
 "    'long_road': students['traveltime'] >= 3,\n",
 "    'low_grade': mean_grade <= (mean_grade.mean() + mean_grade.std()),\n",
 "}).astype(int)\n",
 "\n",
 "# Raw columns that are removed once the indicators exist.\n",
 "# NOTE(review): 'traveltime' is kept next to 'long_road', and 'sex' is dropped\n",
 "# without a derived indicator - confirm that both are intended.\n",
 "replaced_columns = [\n",
 "    'sex',\n",
 "    'address',\n",
 "    'famsize',\n",
 "    'Pstatus',\n",
 "    'studytime',\n",
 "    'failures',\n",
 "    'famrel',\n",
 "    'freetime',\n",
 "    'goout',\n",
 "    'health',\n",
 "    'absences',\n",
 "    'G1',\n",
 "    'G2',\n",
 "    'G3',\n",
 "    'Medu',\n",
 "    'Fedu',\n",
 "    'age',\n",
 "]\n",
 "\n",
 "# Build a new frame instead of adding columns through an alias of `students`, so the\n",
 "# combined raw data stays untouched and this cell can be re-run safely.\n",
 "students_transformed = pd.concat(\n",
 "    [students.drop(replaced_columns, axis=1), engineered], axis=1)"
 ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
 "# Select every column explicitly so that the engineered indicators are part of X.\n",
 "X, y_w, y_d = preprocessing(students_transformed, students_transformed.columns)"
 ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random forest for Walc\n", "Best randomized search score - 0.5\n", "Validation score - 0.59\n", "Random forest for Dalc\n", "Best randomized search score - 0.74\n", "Validation score - 0.74\n" ] } ], "source": [
 "# Run the randomized search once per target on the current X, reusing the\n",
 "# rf_random_search object configured in an earlier cell.\n",
 "for heading, target in [(\"Random forest for Walc\", y_w), (\"Random forest for Dalc\", y_d)]:\n",
 "    print(heading)\n",
 "    x_train, x_test, y_train, y_test = train_test_split(\n",
 "        X, target, test_size=0.3, random_state=42, stratify=target\n",
 "    )\n",
 "    rf_random_search.fit(x_train, y_train)\n",
 "    rf_clf = rf_random_search.best_estimator_\n",
 "    rf_val_score = rf_clf.score(x_test, y_test)\n",
 "    print(\"Best randomized search score - %s\" % round(rf_random_search.best_score_, 2))\n",
 "    print(\"Validation score - %s\" % round(rf_val_score, 2))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Смена задачи\n", "\n", "Результаты для предсказания значения величины \"Walc\" оставляют желать лучшего. \n", "Заметим, что подавляющие большинство значений целевой переменной - 1. \n", "Возможно, правильнее будет для начала решить задачу: \n", "Употребляет ли студент очень мало алкоголя или нет, то есть имеют ли \"Dalc\" и \"Walc\" значение не 1." 
] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
 "# +1 when either consumption level differs from 1, otherwise -1.\n",
 "# Computed on positional arrays, so it does not rely on the Series index being 0..n-1.\n",
 "y = np.where((np.asarray(y_d) != 1) | (np.asarray(y_w) != 1), 1, -1)"
 ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random forest: 0.66\n" ] } ], "source": [
 "# Fixed seed: feature sub-sampling is still random with bootstrap=False.\n",
 "classifier = RandomForestClassifier(max_depth=4, bootstrap=False, random_state=42)\n",
 "sc = cross_val_score(classifier, X, y, cv=5)\n",
 "print(\"Random forest: \", round(sc.mean(), 2))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Попробуем провести такое для разных уровней алкоголизма" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
 "# +1 when the combined workday and weekend level exceeds 3, otherwise -1.\n",
 "# Computed on positional arrays, so it does not rely on the Series index being 0..n-1.\n",
 "y = np.where(np.asarray(y_d) + np.asarray(y_w) > 3, 1, -1)"
 ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random forest: 0.59\n" ] } ], "source": [
 "# Fixed seed: feature sub-sampling is still random with bootstrap=False.\n",
 "classifier = RandomForestClassifier(max_depth=4, bootstrap=False, random_state=42)\n",
 "sc = cross_val_score(classifier, X, y, cv=5)\n",
 "print(\"Random forest: \", round(sc.mean(), 2))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Выводы\n", "Подход к данной задаче как к задаче классификации не дал хороших результатов на данном этапе. \n", "Вероятно, стоит подойти к проблеме с другой стороны и рассмотреть данную задачу как задачу регрессии." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 1 }