{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combining multiple csv files containing data into a single file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SCADA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd  # data manipulation\n",
    "import numpy as np\n",
    "import itertools\n",
    "\n",
    "# Import raw SCADA data: two files from the old system (dfa, dfa1)\n",
    "# and two from the new system (dfa2, dfa3).\n",
    "dfa = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_SCADA.csv', skip_blank_lines=True)\n",
    "dfa1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_SCADA.csv', skip_blank_lines=True)\n",
    "dfa2 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_v2.csv', skip_blank_lines=True)\n",
    "dfa3 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/NS_SCADA_2017_v2.csv', skip_blank_lines=True)\n",
    "\n",
    "def f(c):\n",
    "    \"\"\"Keep the old rotor-speed reading for turbines 1-20 only (NaN otherwise).\"\"\"\n",
    "    if c['turbine'] <= 20:\n",
    "        return c['rs_av']\n",
    "\n",
    "dfa['rs_av_old'] = dfa.apply(f, axis=1)\n",
    "dfa1['rs_av_old'] = dfa1.apply(f, axis=1)\n",
    "\n",
    "# Drop the original rotor-speed columns (now copied into rs_av_old).\n",
    "dfa = dfa.drop('rs_av', axis=1)\n",
    "dfa1 = dfa1.drop('rs_av', axis=1)\n",
    "\n",
    "# Concatenate each pair of files: df = old SCADA, df2 = new SCADA.\n",
    "df = pd.concat([dfa, dfa1])\n",
    "df2 = pd.concat([dfa2, dfa3])\n",
    "del dfa, dfa1, dfa2, dfa3\n",
    "\n",
    "# Convert timestamps to datetime dtype; the old data is day-first.\n",
    "df2['timestamp'] = pd.to_datetime(df2['timestamp'])\n",
    "df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)\n",
    "\n",
    "# Filter the new data so the latest timestamp is the same for both datasets.\n",
    "df2 = df2[df2.timestamp <= '2017-04-30 23:50:00.000']\n",
    "\n",
    "# Align the new data's column names with the old data's.\n",
    "df2 = df2.rename(columns={'turbine_id': 'turbine', 'rs_av': 'rs_av_new'})\n",
    "\n",
    "# Sort and keep the first reading per (timestamp, turbine).\n",
    "df2 = df2.sort_values(['timestamp', 'turbine'])\n",
    "df = df.sort_values(['timestamp', 'turbine'])\n",
    "df2 = df2.drop_duplicates(['timestamp', 'turbine'], keep='first')\n",
    "df = df.drop_duplicates(['timestamp', 'turbine'], keep='first')\n",
    "\n",
    "# Build a complete (timestamp, turbine) grid to fill missing rows in the series.\n",
    "list1 = list(pd.date_range('2014-11-01 00:00:00', '2017-04-30 23:50:00', freq='10min'))\n",
    "list2 = range(1, 26)\n",
    "list3 = list(itertools.product(list1, list2))\n",
    "df3 = pd.DataFrame(list3)\n",
    "df3.columns = ['timestamp', 'turbine']\n",
    "df2['ap_max'] = df2['ap_max'].astype(np.float64)\n",
    "dfx = pd.merge(df, df3, on=['timestamp', 'turbine'], how='outer')\n",
    "del df, df3\n",
    "\n",
    "# Merge old and new data on the shared sensor columns.\n",
    "dfx1 = pd.merge(dfx, df2, on=['timestamp', 'turbine', 'ws_av', 'wd_av', 'ws_1', 'ws_2',\n",
    "                              'wd_1', 'wd_2', 'gen_sp', 'pitch', 'reactive_power',\n",
    "                              'ap_max', 'ap_dev', 'ap_av', 'nac_pos'], how='outer')\n",
    "del dfx, df2\n",
    "\n",
    "# Sort and drop duplicates again after the merge.\n",
    "dfx1 = dfx1.sort_values(['timestamp', 'turbine'])\n",
    "dfx1 = dfx1.drop_duplicates(['timestamp', 'turbine'], keep='first')\n",
    "\n",
    "def f2(c):\n",
    "    \"\"\"Prefer the new rotor-speed reading; fall back to the old one when missing.\"\"\"\n",
    "    if c['rs_av_new'] >= 0:\n",
    "        return c['rs_av_new']\n",
    "    else:\n",
    "        return c['rs_av_old']\n",
    "\n",
    "dfx1['rs_av'] = dfx1.apply(f2, axis=1)\n",
    "\n",
    "# Drop the merged-from columns and reset the index.\n",
    "dfx1 = dfx1.drop(['rs_av_old', 'rs_av_new'], axis=1)\n",
    "dfx1 = dfx1.reset_index(drop=True)\n",
    "\n",
    "# Write the combined SCADA data to a new csv file.\n",
    "dfx1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv', index=False)\n",
    "del dfx1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merging SCADA and downtime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Import downtime data.\n",
    "dfb = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv', skip_blank_lines=True)\n",
    "dfb1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv', skip_blank_lines=True)\n",
    "\n",
    "df1 = pd.concat([dfb, dfb1])\n",
    "del dfb, dfb1\n",
    "\n",
    "# Convert dtype object to datetime.\n",
    "df1['timestamp_start'] = pd.to_datetime(df1['timestamp_start'])\n",
    "df1['timestamp_end'] = pd.to_datetime(df1['timestamp_end'])\n",
    "\n",
    "# Round to the nearest 10 minutes to match the SCADA sampling grid.\n",
    "df1['timestamp_start'] = df1['timestamp_start'].dt.round('10min')\n",
    "df1['timestamp_end'] = df1['timestamp_end'].dt.round('10min')\n",
    "\n",
    "# Duration of each downtime event.\n",
    "df1['period'] = df1['timestamp_end'] - df1['timestamp_start']\n",
    "\n",
    "# Expand each downtime range into one row per 10-minute interval\n",
    "# between its start and end timestamps.\n",
    "df1 = pd.concat(\n",
    "    [pd.DataFrame({'timestamp': pd.date_range(row.timestamp_start, row.timestamp_end, freq='10min'),\n",
    "                   'turbine_id': row.turbine_id, 'period': row.period,\n",
    "                   'TurbineCategory_id': row.TurbineCategory_id,\n",
    "                   'EnvironmentalCategory_id': row.EnvironmentalCategory_id,\n",
    "                   'InfrastructureCategory_id': row.InfrastructureCategory_id,\n",
    "                   'GridCategory_id': row.GridCategory_id,\n",
    "                   'AvailabilityCategory_id': row.AvailabilityCategory_id,\n",
    "                   'alarm_id': row.alarm_id, 'workorder_id': row.workorder_id,\n",
    "                   'comment': row.comment},\n",
    "                  columns=['timestamp', 'turbine_id', 'period', 'TurbineCategory_id',\n",
    "                           'EnvironmentalCategory_id', 'InfrastructureCategory_id',\n",
    "                           'GridCategory_id', 'AvailabilityCategory_id', 'alarm_id',\n",
    "                           'workorder_id', 'comment'])\n",
    "     for i, row in df1.iterrows()], ignore_index=True)\n",
    "\n",
    "# Sort and drop duplicates for the same timestamp and turbine.\n",
    "df1 = df1.sort_values(['timestamp', 'turbine_id', 'period'])\n",
    "df1 = df1.drop_duplicates(['timestamp', 'turbine_id'], keep='first')\n",
    "\n",
    "# Import the combined SCADA data written by the previous section.\n",
    "df = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA.csv', skip_blank_lines=True)\n",
    "\n",
    "# Drop unnecessary per-sensor columns.\n",
    "df = df.drop(['ws_1', 'ws_2', 'wd_1', 'wd_2'], axis=1)\n",
    "\n",
    "df['timestamp'] = pd.to_datetime(df['timestamp'], dayfirst=True)\n",
    "\n",
    "# Copy the turbine id into a column matching the downtime data's name.\n",
    "df['turbine_id'] = df['turbine']\n",
    "\n",
    "df2 = pd.merge(df, df1, how='outer')  # merge SCADA and downtime\n",
    "del df, df1\n",
    "\n",
    "# Drop downtime entries with no SCADA readings - in case of duplicates.\n",
    "df2 = df2.drop(df2[(df2['turbine_id'].notnull()) & (df2['turbine'].isnull())].index)\n",
    "\n",
    "df2 = df2.drop('turbine', axis=1)  # drop old turbine id column\n",
    "\n",
    "# Write the final dataframe to csv.\n",
    "df2.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Downtime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd  # required here so this cell runs on a fresh kernel\n",
    "\n",
    "dfb = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Last_six_months_downtime.csv', skip_blank_lines=True)\n",
    "dfb1 = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/Prior_two_years_downtime.csv', skip_blank_lines=True)\n",
    "\n",
    "# The older export duplicates every column with a '.1' suffix; drop them.\n",
    "dfb1 = dfb1.drop(['AvailabilityCategory_id.1', 'EnvironmentalCategory_id.1',\n",
    "                  'GridCategory_id.1', 'InfrastructureCategory_id.1',\n",
    "                  'TurbineCategory_id.1', 'alarm_id.1', 'comment.1', 'id.1',\n",
    "                  'workorder_id.1', 'turbine_id.1', 'timestamp_end.1',\n",
    "                  'timestamp_start.1'], axis=1)\n",
    "\n",
    "# Convert timestamps to datetime dtype.\n",
    "dfb['timestamp_start'] = pd.to_datetime(dfb['timestamp_start'])\n",
    "dfb['timestamp_end'] = pd.to_datetime(dfb['timestamp_end'])\n",
    "dfb1['timestamp_start'] = pd.to_datetime(dfb1['timestamp_start'])\n",
    "dfb1['timestamp_end'] = pd.to_datetime(dfb1['timestamp_end'])\n",
    "\n",
    "df1 = pd.concat([dfb, dfb1])\n",
    "del dfb, dfb1\n",
    "\n",
    "df1 = df1.sort_values(['timestamp_start', 'timestamp_end', 'turbine_id'])\n",
    "df1 = df1.reset_index(drop=True)\n",
    "\n",
    "df1.to_csv('C:/Users/nithi/Google Drive/Python/Student Data/downtime.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_csv('C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime-NEW.csv', encoding=\"ISO-8859-1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}