{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# k-NN: finding optimal weight function ('distance' or 'uniform')" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd #import libraries\n", "import numpy as np\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import TimeSeriesSplit\n", "\n", "#import data\n", "df=pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime.csv',skip_blank_lines=True)\n", "\n", "list1=list(df['turbine_id'].unique()) #list of turbines to plot\n", "list1=sorted(list1,key=int) #sort turbines in ascending order\n", "list2=list(df['TurbineCategory_id'].unique()) #list of categories \n", "list2=[g for g in list2 if g>=0] #remove NaN from list\n", "list2=sorted(list2,key=int) #sort categories in ascending order\n", "list2=[m for m in list2 if m not in (1,12,13,14,15,17,21,22)] #categories to remove \n", "num=[] #empty list to hold optimal n values for all turbines\n", "err=[] #empty list to hold minimum error readings for all turbines\n", "\n", "for x in list1: #filter only data for turbine x\n", " dfx=df[(df['turbine_id']==x)].copy()\n", " for y in list2: #copying fault to new column (mins) (fault when turbine category id is y)\n", " def f(c):\n", " if c['TurbineCategory_id']==y:\n", " return 0\n", " else:\n", " return 1\n", " dfx['mins']=dfx.apply(f,axis=1)\n", " \n", " dfx=dfx.sort_values(by=\"timestamp\",ascending=False) #sort values by timestamp in descending order\n", " dfx.reset_index(drop=True,inplace=True) #reset index\n", " \n", " if dfx.loc[0,'mins']==0: #seed first row: 0 if fault at latest timestamp, large sentinel otherwise\n", " dfx.at[0,'mins']=0 #.at replaces DataFrame.set_value (deprecated in pandas 0.21, removed in 1.0; .at is already used below)\n", " else:\n", " dfx.at[0,'mins']=999999999\n", "\n", " for i,e in enumerate(dfx['mins']): #using previous value's row to evaluate time\n", " if e==1:\n", " dfx.at[i,'mins']=dfx.at[i-1,'mins']+10\n",
"\n", " dfx=dfx.sort_values(by=\"timestamp\") #sort in ascending order\n", " dfx.reset_index(drop=True,inplace=True) #reset index\n", " dfx['hours']=dfx['mins'].astype(np.int64) #convert to hours, then round to nearest hour\n", " dfx['hours']=dfx['hours']/60\n", " dfx['hours']=round(dfx['hours']).astype(np.int64)\n", " \n",
 " def f1(c): #>48 hours - label as normal (999)\n", " if c['hours']>48:\n", " return 999\n", " else:\n", " return c['hours']\n", " dfx['hours']=dfx.apply(f1,axis=1)\n", " \n",
 " def f2(c): #filter out curtailment - curtailed when turbine is pitching outside 0deg<= normal <=3.5deg\n", " if 0<=c['pitch']<=3.5 or c['hours']!=999 or ((c['pitch']>3.5 or c['pitch']<0) and \n", " (c['ap_av']<=(.1*dfx['ap_av'].max()) \n", " or c['ap_av']>=(.9*dfx['ap_av'].max()))):\n", " return 'normal' \n", " else:\n", " return 'curtailed'\n", " dfx['curtailment']=dfx.apply(f2,axis=1)\n", "\n",
 " def f3(c): #filter unusual readings, i.e. for normal operation, power <=0 in operating wind speeds, power >100... \n", " #before cut-in, runtime <600 and other downtime categories\n", " #NOTE(review): condition reconstructed - original was syntactically invalid (garbled '(31 or', unbalanced parens); verify branches against source data\n", " if c['hours']==999 and ((c['ap_av']<=0 and 3<=c['ws_av']<=31) or \n", " c['runtime']<600 or c['GridCategory_id']>1 or \n", " c['InfrastructureCategory_id']>1 or \n", " c['AvailabilityCategory_id']==2 or \n", " 12<=c['TurbineCategory_id']<=15 or \n", " 21<=c['TurbineCategory_id']<=22 or \n", " (c['ws_av']<3 and c['ap_av']>100)): \n", " return 'unusual' \n", " else:\n", " return 'normal'\n", " dfx['unusual']=dfx.apply(f3,axis=1)\n", "\n",
 " def f4(c): #round to 6 hour intervals\n", " if 1<=c['hours']<=6:\n", " return 6\n", " elif 7<=c['hours']<=12:\n", " return 12\n", " elif 13<=c['hours']<=18:\n", " return 18\n", " elif 19<=c['hours']<=24:\n", " return 24\n", " elif 25<=c['hours']<=30:\n", " return 30\n", " elif 31<=c['hours']<=36:\n", " return 36\n", " elif 37<=c['hours']<=42:\n", " return 42\n", " elif 43<=c['hours']<=48:\n", " return 48\n", " else:\n", " return c['hours']\n", " dfx['hours6']=dfx.apply(f4,axis=1)\n", " \n",
 " def f5(c): #change label for unusual and curtailed data (9999)\n", " if c['unusual']=='unusual' or c['curtailment']=='curtailed':\n", " return 9999\n", " else:\n", " return c['hours6']\n", " dfx['hours_%s'%y]=dfx.apply(f5,axis=1)\n", " \n",
 " dfx=dfx.drop('hours6',axis=1) #drop unnecessary columns\n", " dfx=dfx.drop('hours',axis=1)\n", " dfx=dfx.drop('mins',axis=1)\n", " dfx=dfx.drop('curtailment',axis=1)\n", " dfx=dfx.drop('unusual',axis=1)\n", " \n",
 " #separate features from classes for classification\n", " features=['ap_av','ws_av','wd_av','pitch','ap_max','ap_dev','reactive_power','rs_av','gen_sp','nac_pos']\n", " classes=[col for col in dfx.columns if 'hours' in col]\n", " list3=features+classes+['timestamp'] #list of columns to copy into new df\n", " df2=dfx[list3].copy()\n", " df2=df2.dropna() #drop NaNs\n", " X=df2[features] \n", " X=preprocessing.normalize(X) #normalise features to values b/w 0 and 1\n", " Y=df2[classes] \n", " Y=Y.values #convert from pd dataframe to np array (.as_matrix was removed in pandas 1.0)\n", " \n", " 
weights=['uniform','distance'] #candidate k-NN weight functions to compare\n", " scores=[] #empty list that will hold average cross validation scores for each weight function\n", " tscv=TimeSeriesSplit(n_splits=5) #cross validation using time series split\n", " for w in weights: #looping over weight functions and defining k-NN classifier\n", " knn=KNeighborsClassifier(weights=w,n_jobs=-1)\n", " p1=[] #empty list to hold score for each cross validation fold\n", " for train_index,test_index in tscv.split(X): #looping for each cross validation fold\n", " X_train,X_test=X[train_index],X[test_index] #split train and test sets\n", " Y_train,Y_test=Y[train_index],Y[test_index]\n", " knn1=knn.fit(X_train,Y_train) #fit to classifier and predict\n", " pred=knn1.predict(X_test) \n", " p2=np.sum(np.equal(Y_test,pred))/Y_test.size #accuracy score (elementwise across all label columns)\n", " p1.append(p2) #add to list \n", " p=sum(p1)/len(p1) #average score across all cross validation folds\n", " scores.append(p)\n", " MSE=[1-x for x in scores] #changing to misclassification error (comprehension x is local in Python 3; outer turbine x untouched)\n", " optimal=weights[MSE.index(min(MSE))] #weight function with lowest error\n", " num.append(optimal)\n", " err.append(min(MSE))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weightserrorturbine
0distance0.2082271
1distance0.1341032
2distance0.1089013
3distance0.1250604
4distance0.0932055
5distance0.1166636
6distance0.2151007
7distance0.1458208
8distance0.1201589
9distance0.11741610
10distance0.13243611
11distance0.13828012
12distance0.14259513
13distance0.07437514
14uniform0.18136115
15distance0.15889416
16distance0.14980817
17distance0.11354718
18distance0.08619219
19distance0.14873220
20distance0.07682821
21distance0.07534722
22distance0.09016723
23distance0.16059224
24distance0.07742125
\n", "
" ], "text/plain": [ " weights error turbine\n", "0 distance 0.208227 1\n", "1 distance 0.134103 2\n", "2 distance 0.108901 3\n", "3 distance 0.125060 4\n", "4 distance 0.093205 5\n", "5 distance 0.116663 6\n", "6 distance 0.215100 7\n", "7 distance 0.145820 8\n", "8 distance 0.120158 9\n", "9 distance 0.117416 10\n", "10 distance 0.132436 11\n", "11 distance 0.138280 12\n", "12 distance 0.142595 13\n", "13 distance 0.074375 14\n", "14 uniform 0.181361 15\n", "15 distance 0.158894 16\n", "16 distance 0.149808 17\n", "17 distance 0.113547 18\n", "18 distance 0.086192 19\n", "19 distance 0.148732 20\n", "20 distance 0.076828 21\n", "21 distance 0.075347 22\n", "22 distance 0.090167 23\n", "23 distance 0.160592 24\n", "24 distance 0.077421 25" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d=pd.DataFrame(num,columns=['weights']) #summary table: optimal weight function per turbine\n", "d['error']=err #corresponding minimum misclassification error\n", "d['turbine']=list1 #turbine id\n", "d" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }