{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Decision trees\n",
    "## Balanced, oversampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd  # import libraries\n",
    "import numpy as np\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from sklearn.metrics import precision_score\n",
    "from sklearn.metrics import recall_score\n",
    "from sklearn.metrics import f1_score\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "# import data\n",
    "# NOTE(review): hardcoded absolute local path -- prefer a configurable data directory\n",
    "df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime.csv',skip_blank_lines=True)\n",
    "\n",
    "list1 = sorted(df['turbine_id'].unique(), key=int)  # turbines to process, ascending order\n",
    "list2 = list(df['TurbineCategory_id'].unique())  # list of categories\n",
    "list2 = [g for g in list2 if g >= 0]  # remove NaN from list\n",
    "list2 = sorted(list2, key=int)  # sort categories in ascending order\n",
    "list2 = [m for m in list2 if m not in (1, 12, 13, 14, 15, 17, 21, 22)]  # categories to remove\n",
    "list4 = list(range(0, 14))\n",
    "list5 = list(zip(list4, list2))  # (label index, turbine category) pairs\n",
    "\n",
    "for x in list1:  # filter only data for turbine x\n",
    "    dfx = df[df['turbine_id'] == x].copy()\n",
    "    for y in list2:  # fault flag in new column 'mins': 0 when TurbineCategory_id is y\n",
    "        def ff(c):\n",
    "            if c['TurbineCategory_id'] == y:\n",
    "                return 0\n",
    "            else:\n",
    "                return 1\n",
    "        dfx['mins'] = dfx.apply(ff, axis=1)\n",
    "\n",
    "        dfx = dfx.sort_values(by='timestamp', ascending=False)  # sort by timestamp, descending\n",
    "        dfx.reset_index(drop=True, inplace=True)  # reset index\n",
    "\n",
    "        # first row: keep 0 if faulty, otherwise a large sentinel (no future fault known)\n",
    "        # FIX: DataFrame.set_value() was removed from pandas -- use .at instead\n",
    "        if dfx.loc[0, 'mins'] == 0:\n",
    "            dfx.at[0, 'mins'] = 0\n",
    "        else:\n",
    "            dfx.at[0, 'mins'] = 999999999\n",
    "\n",
    "        for i, e in enumerate(dfx['mins']):  # use previous row's value to accumulate minutes to next fault\n",
    "            if e == 1:\n",
    "                dfx.at[i, 'mins'] = dfx.at[i - 1, 'mins'] + 10\n",
    "\n",
    "        dfx = dfx.sort_values(by='timestamp')  # sort in ascending order\n",
    "        dfx.reset_index(drop=True, inplace=True)  # reset index\n",
    "        dfx['hours'] = dfx['mins'].astype(np.int64)  # convert to hours, then round to nearest hour\n",
    "        dfx['hours'] = dfx['hours'] / 60\n",
    "        dfx['hours'] = round(dfx['hours']).astype(np.int64)\n",
    "\n",
    "        def f11(c):  # >48 hours - label as normal (9999)\n",
    "            if c['hours'] > 48:\n",
    "                return 9999\n",
    "            else:\n",
    "                return c['hours']\n",
    "        dfx['hours'] = dfx.apply(f11, axis=1)\n",
    "\n",
    "        def f22(c):  # filter out curtailment - curtailed when turbine is pitching outside 0deg<= normal <=3.5deg\n",
    "            if (0 <= c['pitch'] <= 3.5 or c['hours'] != 9999 or\n",
    "                    ((c['pitch'] > 3.5 or c['pitch'] < 0) and\n",
    "                     (c['ap_av'] <= (.1 * dfx['ap_av'].max()) or\n",
    "                      c['ap_av'] >= (.9 * dfx['ap_av'].max())))):\n",
    "                return 'normal'\n",
    "            else:\n",
    "                return 'curtailed'\n",
    "        dfx['curtailment'] = dfx.apply(f22, axis=1)\n",
    "\n",
    "        def f3(c):\n",
    "            # filter unusual readings, i.e. for normal operation, power <=0 in\n",
    "            # operating wind speeds, power >100 before cut-in, runtime <600 and\n",
    "            # other downtime categories\n",
    "            # FIX(review): the original condition was corrupted (unbalanced\n",
    "            # parentheses, garbled clause '(31 or ...'); reconstructed from the\n",
    "            # comment above -- TODO confirm against the original analysis\n",
    "            if c['hours'] == 9999 and (\n",
    "                    (3 < c['ws_av'] < 25 and\n",
    "                     (c['ap_av'] <= 0 or c['runtime'] < 600 or\n",
    "                      c['EnvironmentalCategory_id'] > 1 or\n",
    "                      c['GridCategory_id'] > 1 or\n",
    "                      c['InfrastructureCategory_id'] > 1 or\n",
    "                      c['AvailabilityCategory_id'] == 2 or\n",
    "                      12 <= c['TurbineCategory_id'] <= 15 or\n",
    "                      21 <= c['TurbineCategory_id'] <= 22)) or\n",
    "                    (c['ws_av'] < 3 and c['ap_av'] > 100)):\n",
    "                return 'unusual'\n",
    "            else:\n",
    "                return 'normal'\n",
    "        dfx['unusual'] = dfx.apply(f3, axis=1)\n",
    "\n",
    "        def f4(c):  # round to 6 hour intervals\n",
    "            if c['hours'] == 0:\n",
    "                return 10\n",
    "            elif 1 <= c['hours'] <= 6:\n",
    "                return 11\n",
    "            elif 7 <= c['hours'] <= 12:\n",
    "                return 12\n",
    "            elif 13 <= c['hours'] <= 18:\n",
    "                return 13\n",
    "            elif 19 <= c['hours'] <= 24:\n",
    "                return 14\n",
    "            elif 25 <= c['hours'] <= 30:\n",
    "                return 15\n",
    "            elif 31 <= c['hours'] <= 36:\n",
    "                return 16\n",
    "            elif 37 <= c['hours'] <= 42:\n",
    "                return 17\n",
    "            elif 43 <= c['hours'] <= 48:\n",
    "                return 18\n",
    "            else:\n",
    "                return 19  # normal\n",
    "        dfx['hours6'] = dfx.apply(f4, axis=1)\n",
    "\n",
    "        def f5(c):  # change label for unusual and curtailed data (20)\n",
    "            if c['unusual'] == 'unusual' or c['curtailment'] == 'curtailed':\n",
    "                return 20\n",
    "            else:\n",
    "                return c['hours6']\n",
    "        dfx['hours_%s' % y] = dfx.apply(f5, axis=1)\n",
    "\n",
    "        dfx = dfx.drop('hours6', axis=1)  # drop unnecessary columns\n",
    "        dfx = dfx.drop('hours', axis=1)\n",
    "        dfx = dfx.drop('mins', axis=1)\n",
    "        dfx = dfx.drop('curtailment', axis=1)\n",
    "        dfx = dfx.drop('unusual', axis=1)\n",
    "\n",
    "    # separate features from classes for classification\n",
    "    # NOTE(review): indentation reconstructed -- this runs once per turbine,\n",
    "    # after all categories have been labelled (classes picks up every hours_y column)\n",
    "    features = ['ap_av', 'ws_av', 'wd_av', 'pitch', 'ap_max', 'ap_dev',\n",
    "                'reactive_power', 'rs_av', 'gen_sp', 'nac_pos']\n",
    "    classes = [col for col in dfx.columns if 'hours' in col]\n",
    "    list6 = features + classes  # list of columns to copy into new df\n",
    "    df2 = dfx[list6].copy()\n",
    "    df2 = df2.dropna()  # drop NaNs\n",
    "    X = preprocessing.normalize(df2[features])  # normalise features to values b/w 0 and 1\n",
    "    Y = df2[classes]\n",
    "    # FIX: DataFrame.as_matrix() was removed from pandas -- use .values\n",
    "    Y = Y.values  # convert from pd dataframe to np array\n",
    "\n",
    "    tscv = TimeSeriesSplit(n_splits=5)  # cross validation using time series split\n",
    "\n",
    "    dt = DecisionTreeClassifier(criterion='entropy')\n",
    "    for (m, n) in list5:\n",
    "        Ym = Y[:, m]\n",
    "        for train_index, test_index in tscv.split(X):  # looping for each cross validation fold\n",
    "            X_train, X_test = X[train_index], X[test_index]  # split train and test sets\n",
    "            Y_train, Y_test = Ym[train_index], Ym[test_index]\n",
    "            if len(set(Y_train)) > 1:  # oversample only when more than one class is present\n",
    "                ros = RandomOverSampler()\n",
    "                # FIX: fit_sample() was removed from imbalanced-learn -- use fit_resample\n",
    "                Xt, Yt = ros.fit_resample(X_train, Y_train)\n",
    "            else:\n",
    "                Xt, Yt = X_train, Y_train\n",
    "            dt1 = dt.fit(Xt, Yt)  # fit to classifier and predict\n",
    "            Yp = dt1.predict(X_test)\n",
    "            print('Turbine %s with label index %s' % (x, m))\n",
    "            print('Precision: %s; Recall: %s; F1 score: %s' % (\n",
    "                precision_score(Y_test, Yp, average='weighted'),\n",
    "                recall_score(Y_test, Yp, average='weighted'),\n",
    "                f1_score(Y_test, Yp, average='weighted')))\n",
    "            print('------------------------------------------------------------------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}