{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "from glob import glob\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "from ngboost import NGBClassifier\n", "from ngboost.distns import Bernoulli,k_categorical\n", "from ngboost.learners import default_tree_learner" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import classification_report\n", "from display_results import confusion_matrix" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df_activity = pd.read_csv(\"activities.csv\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def segmentation(x_data,y,overlap_rate,time_window):\n", " \n", " seg_data = []\n", " overlap = int((1 - overlap_rate)*time_window)\n", " y_segmented_list = []\n", " \n", " for i in range(0,x_data.shape[0],overlap):\n", " seg_data.append(x_data[i:i+time_window])\n", " y_segmented_list.append(y)\n", "\n", " return seg_data,y_segmented_list" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def handle_missing_values(df):\n", " df['x']=df['x'].replace(0, np.nan)\n", " df['y']=df['y'].replace(0, np.nan)\n", " df['z']=df['z'].replace(0, np.nan)\n", " return df" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "change_act_id = {2:0, 3:1, 4:2, 6:3, 9:4, 12:5}" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def get_act_id(seg_id):\n", " seg = df_activity[df_activity[\"segment_id\"]==seg_id]\n", " activity_id = seg[\"activity_id\"].values\n", " return change_act_id[int(activity_id)]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "def load_data(csv_file):\n", "\n", " y_list = []\n", " x_data_list = []\n", "\n", " csv_df = pd.read_csv(csv_file)\n", " csv_df = handle_missing_values(csv_df)\n", " csv_df.dropna(inplace=True)\n", " x_data = csv_df.values\n", " act_id = get_act_id(int(os.path.splitext(os.path.basename(csv_file))[0].replace(\"segment\",\"\")))\n", " \n", " return x_data,act_id" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "def get_features(x_data):\n", " features = []\n", " for i in range(x_data.shape[1]):\n", " # std\n", " features.append(x_data.T[i].std(ddof=0))\n", " # avg\n", " features.append(np.average(x_data.T[i]))\n", " # max\n", " features.append(np.max(x_data.T[i]))\n", " # min\n", " features.append(np.min(x_data.T[i]))\n", " return features" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "csv_files = glob(\"train/*\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "X_feature_data_list = []\n", "y_list = []\n", "for csv_file in csv_files:\n", " x,y = load_data(csv_file)\n", " X_feature_data_list.append(get_features(x))\n", " y_list.append(y)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "X_train,X_test,y_train,y_test = train_test_split(X_feature_data_list,y_list,test_size=0.3)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "model_ml = RandomForestClassifier(n_estimators=500,n_jobs=-1)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "model_ml.fit(X_train,y_train)\n", "y_predict = model_ml.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.42 0.83 0.56 6\n", " 1 1.00 0.67 0.80 15\n", " 2 0.67 0.29 0.40 7\n", " 3 0.50 0.33 0.40 6\n", " 4 0.43 0.60 0.50 5\n", " 5 0.67 0.80 0.73 15\n", "\n", " accuracy 0.63 54\n", " macro avg 0.61 0.59 0.56 54\n", "weighted avg 0.69 0.63 0.63 54\n", "\n" ] } ], "source": [ "print(classification_report(y_test,y_predict))" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "None\n" ] } ], "source": [ "print(confusion_matrix(y_test, y_predict))" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[iter 0] loss=1.7464 val_loss=0.0000 scale=0.5000 norm=3.6180\n", "[iter 100] loss=0.5734 val_loss=0.0000 scale=2.0000 norm=4.4797\n", "[iter 200] loss=0.2855 val_loss=0.0000 scale=4.0000 norm=6.6994\n", "[iter 300] loss=0.2065 val_loss=0.0000 scale=2.0000 norm=3.0851\n", "[iter 400] loss=0.1836 val_loss=0.0000 scale=1.0000 norm=1.5061\n" ] }, { "data": { "text/plain": [ "NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,\n", " criterion='friedman_mse', max_depth=3,\n", " max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0,\n", " min_impurity_split=None,\n", " min_samples_leaf=1,\n", " min_samples_split=2,\n", " min_weight_fraction_leaf=0.0,\n", " presort='deprecated',\n", " random_state=None, splitter='best'),\n", " Dist=.Categorical'>,\n", " Score=, col_sample=1.0,\n", " learning_rate=0.01, minibatch_frac=1.0, n_estimators=500,\n", " natural_gradient=True,\n", " random_state=RandomState(MT19937) at 0x110FAFDB0, tol=0.0001,\n", " verbose=True, verbose_eval=100)" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ngb = NGBClassifier(Dist = k_categorical(6))\n", "ngb.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "y_predict_ngboost =ngb.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.50 0.75 0.60 4\n", " 1 0.67 0.62 0.65 16\n", " 2 0.29 0.29 0.29 7\n", " 3 0.17 0.33 0.22 3\n", " 4 0.40 0.40 0.40 5\n", " 5 0.73 0.58 0.65 19\n", "\n", " accuracy 0.54 54\n", " macro avg 0.46 0.50 0.47 54\n", "weighted avg 0.58 0.54 0.55 54\n", "\n" ] } ], "source": [ "print(classification_report(y_predict_ngboost,y_test))" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "confusion_matrix(y_predict_ngboost,y_test)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }