{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Decision trees\n",
    "## Balanced, oversampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd  # import libraries\n",
    "import numpy as np\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.metrics import f1_score\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "# import data\n",
    "# NOTE(review): hardcoded absolute local path -- prefer a configurable data directory\n",
    "df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime.csv',skip_blank_lines=True)\n",
    "\n",
    "list1 = sorted(df['turbine_id'].unique(), key=int)  # turbines to process, ascending order\n",
    "list2 = list(df['TurbineCategory_id'].unique())  # list of categories\n",
    "list2 = [g for g in list2 if g >= 0]  # remove NaN from list\n",
    "list2 = sorted(list2, key=int)  # sort categories in ascending order\n",
    "list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]  # categories to remove\n",
    "list4 = list(range(0, 14))\n",
    "list5 = list(zip(list4, list2))  # (label index, turbine category) pairs\n",
    "\n",
    "for x in list1:  # filter only data for turbine x\n",
    "    dfx = df[df['turbine_id'] == x].copy()\n",
    "    for y in list2:  # fault flag in new column 'mins': 0 when TurbineCategory_id is y\n",
    "        def ff(c):\n",
    "            if c['TurbineCategory_id'] == y:\n",
    "                return 0\n",
    "            else:\n",
    "                return 1\n",
    "        dfx['mins'] = dfx.apply(ff, axis=1)\n",
    "\n",
    "        dfx = dfx.sort_values(by='timestamp', ascending=False)  # sort by timestamp, descending\n",
    "        dfx.reset_index(drop=True, inplace=True)  # reset index\n",
    "\n",
    "        # first row: keep 0 if faulty, otherwise a large sentinel (no future fault known)\n",
    "        # FIX: DataFrame.set_value() was removed from pandas -- use .at instead\n",
    "        if dfx.loc[0, 'mins'] == 0:\n",
    "            dfx.at[0, 'mins'] = 0\n",
    "        else:\n",
    "            dfx.at[0, 'mins'] = 999999999\n",
    "\n",
    "        for i, e in enumerate(dfx['mins']):  # use previous row's value to accumulate minutes to next fault\n",
    "            if e == 1:\n",
    "                dfx.at[i, 'mins'] = dfx.at[i - 1, 'mins'] + 10\n",
    "\n",
    "        dfx = dfx.sort_values(by='timestamp')  # sort in ascending order\n",
    "        dfx.reset_index(drop=True, inplace=True)  # reset index\n",
    "        dfx['hours'] = dfx['mins'].astype(np.int64)  # convert to hours, then round to nearest hour\n",
    "        dfx['hours'] = dfx['hours'] / 60\n",
    "        dfx['hours'] = round(dfx['hours']).astype(np.int64)\n",
    "\n",
    "        def f11(c):  # >48 hours - label as normal (9999)\n",
    "            if c['hours'] > 48:\n",
    "                return 9999\n",
    "            else:\n",
    "                return c['hours']\n",
    "        dfx['hours'] = dfx.apply(f11, axis=1)\n",
    "\n",
    "        def f22(c):  # filter out curtailment - curtailed when turbine is pitching outside 0deg<= normal <=3.5deg\n",
    "            if (0 <= c['pitch'] <= 3.5 or c['hours'] != 9999 or\n",
    "                    ((c['pitch'] > 3.5 or c['pitch'] < 0) and\n",
    "                     (c['ap_av'] <= (.1 * dfx['ap_av'].max()) or\n",
    "                      c['ap_av'] >= (.9 * dfx['ap_av'].max())))):\n",
    "                return 'normal'\n",
    "            else:\n",
    "                return 'curtailed'\n",
    "        dfx['curtailment'] = dfx.apply(f22, axis=1)\n",
    "\n",
    "        def f3(c):\n",
    "            # filter unusual readings, i.e. for normal operation, power <=0 in\n",
    "            # operating wind speeds, power >100 before cut-in, runtime <600 and\n",
    "            # other downtime categories\n",
    "            # FIX(review): the original condition was corrupted (unbalanced\n",
    "            # parentheses, garbled clause '(31 or ...'); reconstructed from the\n",
    "            # comment above -- TODO confirm against the original analysis\n",
    "            if c['hours'] == 9999 and (\n",
    "                    (3 < c['ws_av'] < 25 and\n",
    "                     (c['ap_av'] <= 0 or c['runtime'] < 600 or\n",
    "                      c['EnvironmentalCategory_id'] > 1 or\n",
    "                      c['GridCategory_id'] > 1 or\n",
    "                      c['InfrastructureCategory_id'] > 1 or\n",
    "                      c['AvailabilityCategory_id'] == 2 or\n",
    "                      12 <= c['TurbineCategory_id'] <= 15 or\n",
    "                      21 <= c['TurbineCategory_id'] <= 22)) or\n",
    "                    (c['ws_av'] < 3 and c['ap_av'] > 100)):\n",
    "                return 'unusual'\n",
    "            else:\n",
    "                return 'normal'\n",
    "        dfx['unusual'] = dfx.apply(f3, axis=1)\n",
    "\n",
    "        def f4(c):  # round to 6 hour intervals\n",
    "            if c['hours'] == 0:\n",
    "                return 10\n",
    "            elif 1 <= c['hours'] <= 6:\n",
    "                return 11\n",
    "            elif 7 <= c['hours'] <= 12:\n",
    "                return 12\n",
    "            elif 13 <= c['hours'] <= 18:\n",
    "                return 13\n",
    "            elif 19 <= c['hours'] <= 24:\n",
    "                return 14\n",
    "            elif 25 <= c['hours'] <= 30:\n",
    "                return 15\n",
    "            elif 31 <= c['hours'] <= 36:\n",
    "                return 16\n",
    "            elif 37 <= c['hours'] <= 42:\n",
    "                return 17\n",
    "            elif 43 <= c['hours'] <= 48:\n",
    "                return 18\n",
    "            else:\n",
    "                return 19  # normal\n",
    "        dfx['hours6'] = dfx.apply(f4, axis=1)\n",
    "\n",
    "        def f5(c):  # change label for unusual and curtailed data (20)\n",
    "            if c['unusual'] == 'unusual' or c['curtailment'] == 'curtailed':\n",
    "                return 20\n",
    "            else:\n",
    "                return c['hours6']\n",
    "        dfx['hours_%s' % y] = dfx.apply(f5, axis=1)\n",
    "\n",
    "        dfx = dfx.drop('hours6', axis=1)  # drop unnecessary columns\n",
    "        dfx = dfx.drop('hours', axis=1)\n",
    "        dfx = dfx.drop('mins', axis=1)\n",
    "        dfx = dfx.drop('curtailment', axis=1)\n",
    "        dfx = dfx.drop('unusual', axis=1)\n",
    "\n",
    "    # separate features from classes for classification\n",
    "    # NOTE(review): indentation reconstructed -- this runs once per turbine,\n",
    "    # after all categories have been labelled (classes picks up every hours_y column)\n",
    "    features = ['ap_av', 'ws_av', 'wd_av', 'pitch', 'ap_max', 'ap_dev',\n",
    "                'reactive_power', 'rs_av', 'gen_sp', 'nac_pos']\n",
    "    classes = [col for col in dfx.columns if 'hours' in col]\n",
    "    list6 = features + classes  # list of columns to copy into new df\n",
    "    df2 = dfx[list6].copy()\n",
    "    df2 = df2.dropna()  # drop NaNs\n",
    "    X = preprocessing.normalize(df2[features])  # normalise features to values b/w 0 and 1\n",
    "    Y = df2[classes]\n",
    "    # FIX: DataFrame.as_matrix() was removed from pandas -- use .values\n",
    "    Y = Y.values  # convert from pd dataframe to np array\n",
    "\n",
    "    tscv = TimeSeriesSplit(n_splits=5)  # cross validation using time series split\n",
    "\n",
    "    dt = DecisionTreeClassifier(criterion='entropy')\n",
    "    for (m, n) in list5:\n",
    "        Ym = Y[:, m]\n",
    "        for train_index, test_index in tscv.split(X):  # looping for each cross validation fold\n",
    "            X_train, X_test = X[train_index], X[test_index]  # split train and test sets\n",
    "            Y_train, Y_test = Ym[train_index], Ym[test_index]\n",
    "            if len(set(Y_train)) > 1:  # oversample only when more than one class is present\n",
    "                ros = RandomOverSampler()\n",
    "                # FIX: fit_sample() was removed from imbalanced-learn -- use fit_resample\n",
    "                Xt, Yt = ros.fit_resample(X_train, Y_train)\n",
    "            else:\n",
    "                Xt, Yt = X_train, Y_train\n",
    "            dt1 = dt.fit(Xt, Yt)  # fit to classifier and predict\n",
    "            Yp = dt1.predict(X_test)\n",
    "            print('Turbine %s with label index %s' % (x, m))\n",
    "            print('Precision: %s; Recall: %s; F1 score: %s' % (\n",
    "                precision_score(Y_test, Yp, average='weighted'),\n",
    "                recall_score(Y_test, Yp, average='weighted'),\n",
    "                f1_score(Y_test, Yp, average='weighted')))\n",
    "            print('------------------------------------------------------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}