In [2]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn import tree

In [16]:
data_mat = pd.read_csv("student-mat.csv", delimiter=';')
data_por = pd.read_csv("student-por.csv", delimiter=';')

categorical_features_name = [
 "school", 
 "sex", 
 "address", 
 "famsize", 
 "Pstatus", 
 "Mjob", 
 "Fjob", 
 "reason", 
 "guardian", 
 "schoolsup", 
 "famsup", 
 "paid", 
 "activities", 
 "nursery", 
 "higher", 
 "internet", 
 "romantic"
]

choosed_features = [
 
]

target_features = [
 "Dalc",
 "Walc"
]

In [17]:
# Обработка данных. Кодирование категориальных признаков единичными векторами с помощью OneHotEncoder'а
def preprocessing(data, columns=data_mat.columns):
 X = data.loc[:, data.columns.isin(columns)]
 for target in target_features:
 if X.columns.contains(target):
 X = X.drop(columns=target)

 X.loc[:, X.columns.isin(categorical_features_name)] = \
 X.loc[:, X.columns.isin(categorical_features_name)].apply(LabelEncoder().fit_transform)
 
 enc = OneHotEncoder()
 X_cat = enc.fit_transform(X.loc[:, X.columns.isin(categorical_features_name)]).todense()
 X = np.concatenate([X_cat, X.loc[:, ~X.columns.isin(categorical_features_name)].values], axis=1)
 
 return X, data.loc[:, "Walc"], data.loc[:, "Dalc"]

In [18]:
l = ["paid", "schoolsup", "activities", "higher", "freetime", "goout", "male"]

# Random forest для задачи классификации

Будем рассматривать задачи классификации по 5 классам. Предсказываемые переменные - "Dalc" и "Walc".

Применим для её решения ансамбль решающих деревьев, используя выбранные выше признаки.

In [19]:
X, y_w, y_d = preprocessing(data_mat, l)
rf_clf = RandomForestClassifier(class_weight = "balanced")
rf_param_dist ={"n_estimators" : np.arange(10, 100, 10),
 "max_depth": sp_randint(1, 31),
 "max_features": sp_randint(1, 11),
 "min_samples_leaf": sp_randint(1, 11),
 "max_features" : ["auto", "sqrt", "log2"],
 "criterion": ["gini", "entropy"]}

n_iter_search = 250
rf_random_search = RandomizedSearchCV(rf_clf, param_distributions = rf_param_dist,
 n_iter = n_iter_search, random_state = 42)

print("Random forest for Walc")
x_train, x_test, y_train, y_test = train_test_split(X, y_w, test_size=0.3, random_state=42, stratify = y_w)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))

print("Random forest for Dalc")
x_train, x_test, y_train, y_test = train_test_split(X, y_d, test_size=0.3, random_state=42, stratify = y_d)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))

Random forest for Walc
Best randomized search score - 0.37
Validation score - 0.33
Random forest for Dalc
Best randomized search score - 0.53
Validation score - 0.58


# Переход к новым признакам
Постараемся сгенерировать новые признаки на базе старых. 
Будем искать оптимальные параметры для них.

In [20]:
students = data_mat.append(data_por).reset_index(drop=True)
students_transformed = students
students_transformed['rural'] = students['address'] == "R"
students_transformed['big_family'] = students['famsize'] == 'GT3'
students_transformed['parents_together'] = students['Pstatus'] == "T"
students_transformed['studies_less'] = students['studytime'] < 3
students_transformed['more_failures'] = students['failures'] >= 2
students_transformed['bad_relationships'] = students['famrel'] <= 2
students_transformed['more_free_time'] = students['freetime'] > 3
students_transformed['goes_out_more'] = students['goout'] > 4
students_transformed['bad_health'] = students['health'] <= 2
students_transformed['high_absences'] = students['absences'] > (students['absences'].std() * 2)
students_transformed['mothers_low_edu'] = students['Medu'] <= 3
students_transformed['fathers_low_edu'] = students['Fedu'] <= 3
students_transformed['more_than_18'] = students['age'] > 18
students_transformed['long_road'] = students['traveltime'] >= 3

sum_grade = students['G1'] + students['G2'] + students['G3']
mean_grade = (sum_grade) / 3 
students_transformed['low_grade'] = mean_grade <= (mean_grade.mean() + mean_grade.std())

students_transformed = students_transformed.drop(['sex',
 'address',
 'famsize',
 'Pstatus',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences',
 'G1',
 'G2',
 'G3',
 'Medu',
 'Fedu',
 'age',
 ], axis=1)

In [21]:
X, y_w, y_d = preprocessing(students_transformed)

In [22]:
print("Random forest for Walc")
x_train, x_test, y_train, y_test = train_test_split(X, y_w, test_size=0.3, random_state=42, stratify = y_w)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))

print("Random forest for Dalc")
x_train, x_test, y_train, y_test = train_test_split(X, y_d, test_size=0.3, random_state=42, stratify = y_d)
rf_random_search.fit(x_train, y_train)
rf_clf = rf_random_search.best_estimator_
rf_val_score = rf_clf.score(x_test, y_test)
print("Best randomized search score - %s" % round(rf_random_search.best_score_, 2))
print("Validation score - %s" % round(rf_val_score, 2))

Random forest for Walc
Best randomized search score - 0.5
Validation score - 0.59
Random forest for Dalc
Best randomized search score - 0.74
Validation score - 0.74


## Смена задачи

Результаты для предсказания значения величины "Walc" оставляют желать лучшего. 
Заметим, что подавляющие большинство значений целевой переменной - 1. 
Возможно, правильнее будет для начала решить задачу: 
Употребляет ли студент очень мало алкоголя или нет, то есть имеют ли "Dalc" и "Walc" значение не 1.

In [23]:
y = np.zeros_like(y_w)
for i in range(len(y_w)):
 if y_d[i] != 1 or y_w[i] != 1:
 y[i] = 1
 else:
 y[i] = -1

In [24]:
classifier = RandomForestClassifier(max_depth=4, bootstrap=False)
sc = cross_val_score(classifier, X, y, cv=5)
print("Random forest: ", round(sc.mean(), 2))

Random forest: 0.66


Попробуем провести такое для разных уровней алкоголизма

In [25]:
y = np.zeros_like(y_w)
for i in range(len(y_w)):
 if y_d[i] + y_w[i] > 3:
 y[i] = 1
 else:
 y[i] = -1

In [26]:
classifier = RandomForestClassifier(max_depth=4, bootstrap=False)
sc = cross_val_score(classifier, X, y, cv=5)
print("Random forest: ", round(sc.mean(), 2))

Random forest: 0.59


# Выводы
Подход к данной задаче как к задаче классификации не дал хороших результатов на данном этапе. 
Вероятно, стоит подойти к проблеме с другой стороны и рассмотреть данную задачу как задачу регрессии.