{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Random forests: finding optimal criterion ('gini' or 'entropy')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd #import libraries\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import TimeSeriesSplit\n", "\n", "#import data - NOTE(review): hardcoded absolute local path; consider a configurable data directory\n", "df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime.csv',skip_blank_lines=True)\n", "\n", "list1=list(df['turbine_id'].unique()) #list of turbines to plot\n", "list1=sorted(list1,key=int) #sort turbines in ascending order\n", "list2=list(df['TurbineCategory_id'].unique()) #list of categories\n", "list2=[g for g in list2 if g>=0] #remove NaN from list (NaN>=0 is False)\n", "list2=sorted(list2,key=int) #sort categories in ascending order\n", "list2=[m for m in list2 if m not in (1,12,13,14,15,17,21,22)] #categories to remove\n", "num=[] #empty list to hold optimal criterion for each turbine\n", "err=[] #empty list to hold minimum error readings for all turbines\n", "\n", "for x in list1: #filter only data for turbine x\n", "    dfx=df[(df['turbine_id']==x)].copy()\n", "    for y in list2: #copying fault to new column (mins) (fault when turbine category id is y)\n", "        def f(c):\n", "            #0 marks a fault row for category y, 1 marks a non-fault row\n", "            if c['TurbineCategory_id']==y:\n", "                return 0\n", "            else:\n", "                return 1\n", "        dfx['mins']=dfx.apply(f,axis=1)\n", "\n", "        dfx=dfx.sort_values(by=\"timestamp\",ascending=False) #sort values by timestamp in descending order\n", "        dfx.reset_index(drop=True,inplace=True) #reset index\n", "\n", "        #seed first row: 0 if already a fault, otherwise a large number so the cumulative\n", "        #loop below can add to it (.at replaces DataFrame.set_value, removed in pandas 1.0)\n", "        if dfx.loc[0,'mins']==0:\n", "            dfx.at[0,'mins']=0\n", "        else:\n", "            dfx.at[0,'mins']=999999999 #to allow the following loop to work\n", "\n", "        for i,e in enumerate(dfx['mins']): #using previous value's row to evaluate time\n", 
"            if e==1:\n", "                dfx.at[i,'mins']=dfx.at[i-1,'mins']+10 #each row is a 10-minute SCADA interval\n", "\n", "        dfx=dfx.sort_values(by=\"timestamp\") #sort in ascending order\n", "        dfx.reset_index(drop=True,inplace=True) #reset index\n", "        dfx['hours']=dfx['mins'].astype(np.int64) #convert to hours, then round to nearest hour\n", "        dfx['hours']=dfx['hours']/60\n", "        dfx['hours']=round(dfx['hours']).astype(np.int64) #round to integer\n", "\n", "        def f1(c): #>48 hours - label as normal (999)\n", "            if c['hours']>48:\n", "                return 999\n", "            else:\n", "                return c['hours']\n", "        dfx['hours']=dfx.apply(f1,axis=1)\n", "\n", "        def f2(c): #filter out curtailment - curtailed when turbine is pitching outside 0deg<= normal <=3.5deg\n", "            if 0<=c['pitch']<=3.5 or c['hours']!=999 or ((c['pitch']>3.5 or c['pitch']<0) and \n", "                (c['ap_av']<=(.1*dfx['ap_av'].max()) \n", "                or c['ap_av']>=(.9*dfx['ap_av'].max()))):\n", "                return 'normal'\n", "            else:\n", "                return 'curtailed'\n", "        dfx['curtailment']=dfx.apply(f2,axis=1)\n", "\n", "        def f3(c): #filter unusual readings for normal operation (downtime categories, power >100 before cut-in)\n", "            #NOTE(review): the original condition was garbled (stray '31', unbalanced parentheses\n", "            #- a SyntaxError as written); reconstructed from the visible terms - confirm against\n", "            #the source notebook\n", "            if c['hours']==999 and (c['GridCategory_id']>1 or \n", "                c['InfrastructureCategory_id']>1 or \n", "                c['AvailabilityCategory_id']==2 or \n", "                12<=c['TurbineCategory_id']<=15 or \n", "                21<=c['TurbineCategory_id']<=22 or \n", "                (c['ws_av']<3 and c['ap_av']>100)):\n", "                return 'unusual'\n", "            else:\n", "                return 'normal'\n", "        dfx['unusual']=dfx.apply(f3,axis=1)\n", "\n", "        def f4(c): #round up to 6-hour intervals to reduce number of classes\n", "            #equivalent to the original 8-branch elif chain: 1-6 -> 6, 7-12 -> 12, ..., 43-48 -> 48\n", "            if 1<=c['hours']<=48:\n", "                return ((c['hours']+5)//6)*6 #ceiling to the next multiple of 6\n", "            else:\n", "                return c['hours'] #0 and 999 pass through unchanged\n", "        dfx['hours6']=dfx.apply(f4,axis=1)\n", "\n", "        def f5(c): #change label for unusual and curtailed data (9999), if originally labelled as normal\n", "            if c['unusual']=='unusual' or c['curtailment']=='curtailed':\n", "                return 9999\n", "            else:\n", "                return c['hours6']\n", "        dfx['hours_%s'%y]=dfx.apply(f5,axis=1) #apply to new column specific to each fault\n", "\n", "        dfx=dfx.drop(['hours6','hours','mins','curtailment','unusual'],axis=1) #drop intermediate columns in one call\n", "\n", "    features=['ap_av','ws_av','wd_av','pitch','ap_max','ap_dev','reactive_power','rs_av','gen_sp','nac_pos']\n", "    classes=[col for col in dfx.columns if 'hours' in col] #separate features from classes for classification\n", "    list3=features+classes+['timestamp'] #list of columns to copy into new df\n", "    df2=dfx[list3].copy()\n", "    df2=df2.dropna() #drop NaNs\n", "    X=df2[features]\n", "    X=preprocessing.normalize(X) #normalise features to values b/w 0 and 1\n", "    Y=df2[classes] 
\n", "    Y=Y.values #convert from pd dataframe to np array (.values replaces .as_matrix(), removed in pandas 1.0)\n", "\n", "    criterion=['gini','entropy']\n", "    scores=[]\n", "    tscv=TimeSeriesSplit(n_splits=5) #cross validation using time series split\n", "    for c in criterion: #looping over each criterion and defining random forest classifier\n", "        rf=RandomForestClassifier(criterion=c,n_jobs=-1)\n", "        p1=[] #empty list to hold score for each cross validation fold\n", "        for train_index,test_index in tscv.split(X): #looping for each cross validation fold\n", "            X_train,X_test=X[train_index],X[test_index] #split train and test sets\n", "            Y_train,Y_test=Y[train_index],Y[test_index]\n", "            rf1=rf.fit(X_train,Y_train) #fit to classifier and predict\n", "            pred=rf1.predict(X_test)\n", "            p2=np.sum(np.equal(Y_test,pred))/Y_test.size #element-wise accuracy across all output columns\n", "            p1.append(p2) #add to list\n", "        p=sum(p1)/len(p1) #average score across all cross validation folds\n", "        scores.append(p)\n", "    MSE=[1-s for s in scores] #misclassification error (renamed s to avoid shadowing the turbine loop variable x)\n", "    optimal=criterion[MSE.index(min(MSE))] #determining best criterion\n", "    num.append(optimal)\n", "    err.append(min(MSE))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
criterionerrorturbine
0entropy0.1529141
1entropy0.1007772
2entropy0.0704363
3entropy0.0742874
4entropy0.0745645
5entropy0.2503216
6gini0.1249367
7entropy0.2622978
8gini0.0850259
9entropy0.06700010
10entropy0.17870811
11entropy0.29471512
12entropy0.08169613
13entropy0.85016114
14entropy0.11580015
15entropy0.11105516
16entropy0.10347617
17entropy0.72318818
18entropy0.07221919
19entropy0.47486620
20entropy0.57672921
21entropy0.06335022
22entropy0.38810223
23entropy0.38181624
24entropy0.64188825
\n", "
" ], "text/plain": [ " criterion error turbine\n", "0 entropy 0.152914 1\n", "1 entropy 0.100777 2\n", "2 entropy 0.070436 3\n", "3 entropy 0.074287 4\n", "4 entropy 0.074564 5\n", "5 entropy 0.250321 6\n", "6 gini 0.124936 7\n", "7 entropy 0.262297 8\n", "8 gini 0.085025 9\n", "9 entropy 0.067000 10\n", "10 entropy 0.178708 11\n", "11 entropy 0.294715 12\n", "12 entropy 0.081696 13\n", "13 entropy 0.850161 14\n", "14 entropy 0.115800 15\n", "15 entropy 0.111055 16\n", "16 entropy 0.103476 17\n", "17 entropy 0.723188 18\n", "18 entropy 0.072219 19\n", "19 entropy 0.474866 20\n", "20 entropy 0.576729 21\n", "21 entropy 0.063350 22\n", "22 entropy 0.388102 23\n", "23 entropy 0.381816 24\n", "24 entropy 0.641888 25" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d=pd.DataFrame(num,columns=['criterion'])\n", "d['error']=err\n", "d['turbine']=list1\n", "d" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }