{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd, numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "ro=['5765358043206','9812808043220','9576658043223','1699958043227','7225068043228','265208043229']\n", "hu=['8073718043234','2087988043232','6247548043235']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "p='C:/Users/csala/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5765358043206\n", "9812808043220\n", "9576658043223\n", "1699958043227\n", "7225068043228\n", "265208043229\n" ] } ], "source": [ "stations=[]\n", "for i in ro:\n", " stations.append(pd.read_csv(p+'high_res/raw/ro/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8073718043234\n", "2087988043232\n", "6247548043235\n" ] } ], "source": [ "for i in hu:\n", " stations.append(pd.read_csv(p+'high_res/raw/hu/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "station=pd.concat(stations)\n", "station=station.drop_duplicates()\n", "station[2]=station[2].str.strip()\n", "station[3]=station[3].str.strip()\n", "station=station[[0,2,3,4,5,6]]\n", "station.columns=['ID','LOC','COUNTRY','LAT','LON','ELEVATION']\n", "station.to_csv(p+'stations.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**!!! 
At least 16 GB of memory is required; 64 GB is recommended**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RO" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "5765358043206\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "9812808043220\n", "9576658043223\n", "1699958043227\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (4,6,11,12,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "7225068043228\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,24,25) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "265208043229\n" ] } ], "source": [ "dfs=[]\n", "for i in ro:\n", " df=pd.read_csv(p+'high_res/raw/ro/'+i+'dat.txt',delimiter= '\\s+')\n", " dfs.append(df)\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "dfz=pd.concat(dfs)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "dfs=None #free memory\n", "df=None #free memory" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
USAFWBANYR--MODAHRMNDIRSPDGUSCLGSKCLM...SLPALTSTPMAXMINPCP01PCP06PCP24PCPXXSD
0119000999992012050100000402***********...1017.6*****984.8***********0.00************
1119000999992012050101000402***********...1017.8*****984.9****************************
211900099999201205010200***0***********...1018.1*****985****************************
3119000999992012050103000302***********...1018.3*****985.2****************************
411900099999201205010400***0***********...1018.8*****985.6****************************
\n", "

5 rows × 33 columns

\n", "
" ], "text/plain": [ " USAF WBAN YR--MODAHRMN DIR SPD GUS CLG SKC L M ... SLP \\\n", "0 119000 99999 201205010000 040 2 *** *** *** * * ... 1017.6 \n", "1 119000 99999 201205010100 040 2 *** *** *** * * ... 1017.8 \n", "2 119000 99999 201205010200 *** 0 *** *** *** * * ... 1018.1 \n", "3 119000 99999 201205010300 030 2 *** *** *** * * ... 1018.3 \n", "4 119000 99999 201205010400 *** 0 *** *** *** * * ... 1018.8 \n", "\n", " ALT STP MAX MIN PCP01 PCP06 PCP24 PCPXX SD \n", "0 ***** 984.8 *** *** ***** 0.00 ***** ***** ** \n", "1 ***** 984.9 *** *** ***** ***** ***** ***** ** \n", "2 ***** 985 *** *** ***** ***** ***** ***** ** \n", "3 ***** 985.2 *** *** ***** ***** ***** ***** ** \n", "4 ***** 985.6 *** *** ***** ***** ***** ***** ** \n", "\n", "[5 rows x 33 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfz.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "#!!!! DO NOT DO THIS\n", "#dfz=dfz.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')\n", "dfz['year']=dfz['time'].dt.year\n", "dfz['month']=dfz['time'].dt.month\n", "dfz['day']=dfz['time'].dt.day\n", "dfz['hour']=dfz['time'].dt.hour" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "#keep only months with at least 6 days to avoid anomalies (20%)\n", "filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()\n", "filt2=filt[filt>5].dropna()\n", "#keep only years with at least 3 months to avoid anomalies (20%)\n", "filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()\n", "filt4=filt3[filt3>3].dropna()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
month
USAFyear
11900020128.0
201312.0
201412.0
201512.0
20164.0
\n", "
" ], "text/plain": [ " month\n", "USAF year \n", "119000 2012 8.0\n", " 2013 12.0\n", " 2014 12.0\n", " 2015 12.0\n", " 2016 4.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filt4.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "119000\n", "150000\n", "150001\n", "150002\n", "150010\n", "150040\n", "150070\n", "150090\n", "150100\n", "150105\n", "150140\n", "150150\n", "150200\n", "150230\n", "150235\n", "150250\n", "150320\n", "150330\n", "150400\n", "150410\n", "150420\n", "150440\n", "150470\n", "150520\n", "150550\n", "150560\n", "150630\n", "150690\n", "150730\n", "150750\n", "150800\n", "150830\n", "150850\n", "150880\n", "150890\n", "150900\n", "150940\n", "150950\n", "150990\n", "151070\n", "151080\n", "151090\n", "151110\n", "151130\n", "151170\n", "151180\n", "151190\n", "151200\n", "151205\n", "151230\n", "151240\n", "151270\n", "151320\n", "151340\n", "151360\n", "151380\n", "151400\n", "151430\n", "151450\n", "151455\n", "151480\n", "151500\n", "151540\n", "151580\n", "151590\n", "151600\n", "151620\n", "151630\n", "151650\n", "151680\n", "151700\n", "151740\n", "151790\n", "151820\n", "151840\n", "151890\n", "151940\n", "151970\n", "151990\n", "152000\n", "152005\n", "152040\n", "152060\n", "152080\n", "152090\n", "152120\n", "152150\n", "152170\n", "152190\n", "152210\n", "152300\n", "152310\n", "152350\n", "152380\n", "152410\n", "152450\n", "152470\n", "152540\n", "152590\n", "152600\n", "152610\n", "152620\n", "152640\n", "152650\n", "152670\n", "152700\n", "152730\n", "152770\n", "152790\n", "152800\n", "152820\n", "152840\n", "152850\n", "152870\n", "152890\n", "152920\n", "152960\n", "152970\n", "152980\n", "152990\n", "153000\n", "153010\n", "153020\n", "153070\n", "153100\n", "153140\n", "153150\n", "153160\n", "153170\n", "153190\n", "153200\n", "153210\n", "153240\n", "153250\n", "153280\n", 
"153330\n", "153350\n", "153355\n", "153360\n", "153370\n", "153380\n", "153400\n", "153410\n", "153440\n", "153450\n", "153460\n", "153470\n", "153490\n", "153500\n", "153550\n", "153560\n", "153600\n", "153630\n", "153640\n", "153660\n", "153690\n", "153730\n", "153750\n", "153770\n", "153870\n", "153880\n", "153890\n", "153950\n", "154020\n", "154050\n", "154060\n", "154080\n", "154090\n", "154100\n", "154120\n", "154160\n", "154190\n" ] } ], "source": [ "for stn in filt4.index.unique(0):\n", " years=filt4.loc[stn].index.unique()\n", " d=dfz[dfz['USAF']==stn]\n", " d=d[d['year'].isin(years)]\n", " d.to_csv(p+'high_res/export/'+str(stn)+'.csv')\n", " print(stn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "HU" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "8073718043234\n", "2087988043232\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,20,21,22,23,24,25) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "6247548043235\n" ] } ], "source": [ "dfs=[]\n", "for i in hu:\n", " df=pd.read_csv(p+'high_res/raw/hu/'+i+'dat.txt',delimiter= '\\s+')\n", " dfs.append(df)\n", " print(i)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "dfz=pd.concat(dfs)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "dfs=None #free memory\n", "df=None #free memory" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')\n", "dfz['year']=dfz['time'].dt.year\n", "dfz['month']=dfz['time'].dt.month\n", "dfz['day']=dfz['time'].dt.day\n", "dfz['hour']=dfz['time'].dt.hour" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "#keep only months with at least 6 days to avoid anomalies (20%)\n", "filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()\n", "filt2=filt[filt>5].dropna()\n", "#keep only years with at least 3 months to avoid anomalies (20%)\n", "filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()\n", "filt4=filt3[filt3>3].dropna()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "127560\n", "127660\n", "127720\n", "127860\n", "128050\n", "128120\n" ] } ], "source": [ "for stn in filt4.index.unique(0):\n", " years=filt4.loc[stn].index.unique()\n", " d=dfz[dfz['USAF']==stn]\n", " d=d[d['year'].isin(years)]\n", " d.to_csv(p+'high_res/export/'+str(stn)+'.csv')\n", " print(stn)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }