{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "fdd7071a-0bcf-48c5-9ed7-772ab69e59df"
    }
   },
   "source": [
    "<div style=\"width:100%; background-color: #D9EDF7; border: 1px solid #CFCFCF; text-align: left; padding: 10px;\">\n",
    "      <b>Weather Data: Renewables.ninja processing notebook</b>\n",
    "      <ul>\n",
    "        <li><a href=\"main.ipynb\">Main notebook</a></li>\n",
    "        <li><a href=\"download.ipynb\">Download weather data from Renewables.ninja</a></li>\n",
    "        <li>Process weather data from Renewables.ninja</li>\n",
    "        <li><a href=\"download_merra2.ipynb\">Example script to download arbitrary MERRA-2 data</a></li>\n",
    "      </ul>\n",
    "      <br>This Notebook is part of the <a href=\"http://data.open-power-system-data.org/weather_data\">Weather Data Package</a> of <a href=\"http://open-power-system-data.org\">Open Power System Data</a>.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import glob\n",
    "import os\n",
    "import sqlite3\n",
    "import hashlib\n",
    "import shutil\n",
    "\n",
    "import pandas as pd\n",
    "import geopandas as gp\n",
    "import gsee\n",
    "import tqdm\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import generate_metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "version = '2019-04-09'\n",
    "changes = 'All European countries'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_shapefiles = os.path.join('downloads', 'shapefiles')\n",
    "dir_countries = os.path.join('downloads', 'countries')\n",
    "dir_nuts = os.path.join('downloads', 'nuts')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read and process data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if os.path.exists('cached_dataframe.csv'):\n",
    "    df_cached = pd.read_csv('cached_dataframe.csv', index_col=0, header=[0, 1], parse_dates=True)\n",
    "else:\n",
    "    df_cached = df_cached = pd.DataFrame({})  # Empty dataframe, to permit 'x in df_cached' tests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "dataframes = {}\n",
    "parse_kwargs = dict(skiprows=2, index_col=0, parse_dates=True)\n",
    "\n",
    "# Files of form `ninja_weather_country_AT_merra-2_population_weighted.csv`\n",
    "for f in glob.glob(os.path.join(dir_countries, '*.csv')):\n",
    "\n",
    "    country_code = f.split('_')[3]\n",
    "    \n",
    "    if country_code not in df_cached:\n",
    "        df = pd.read_csv(f, **parse_kwargs)\n",
    "        dataframes[country_code] = df\n",
    "    \n",
    "# Files of form `ninja_weather_irradiance_surface_country_DE_merra-2_nuts-2_population_weighted.csv`\n",
    "for f in glob.glob(os.path.join(dir_nuts, '*.csv')):\n",
    "\n",
    "    country_code = f.split('country_')[1][0:2]\n",
    "    variable = f.split('weather_')[1].split('_country')[0]\n",
    "\n",
    "    df = pd.read_csv(f, **parse_kwargs)\n",
    "    df = df.rename(columns={country_code + '_TOTAL': country_code})\n",
    "\n",
    "    for c in df.columns:\n",
    "        \n",
    "        if c not in df_cached:\n",
    "        \n",
    "            if c not in dataframes:\n",
    "                dataframes[c] = pd.DataFrame({variable: df[c]})\n",
    "            else:\n",
    "                dataframes[c].loc[:, variable] = df[c]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimate direct and diffuse radiation using the BRL model as implented in GSEE.\n",
    "# https://github.com/renewables-ninja/gsee\n",
    "\n",
    "# Warning: this code takes a while to execute (easily 1-2mins CPU time per location).\n",
    "\n",
    "nuts_centroids = gp.GeoDataFrame.from_file(os.path.join(dir_shapefiles, 'NUTS_LB_2016_4326.shp'))\n",
    "nuts_centroids.set_index('NUTS_ID', inplace=True)\n",
    "\n",
    "# Map GB to UK, GR to EL\n",
    "nuts_centroids.loc['GB', : ] = nuts_centroids.loc['UK', :]\n",
    "nuts_centroids.loc['GR', : ] = nuts_centroids.loc['EL', :]\n",
    "\n",
    "data = {k: {\n",
    "    'clearness': dataframes[k]['irradiance_surface'] / dataframes[k]['irradiance_toa'],\n",
    "    'centroid': list(nuts_centroids.loc[k, 'geometry'].coords)[0][::-1],\n",
    "    } for k in dataframes.keys()}\n",
    "\n",
    "\n",
    "def process_item(item):\n",
    "    return gsee.brl_model.run(hourly_clearness=item['clearness'], coords=item['centroid'])\n",
    "\n",
    "\n",
    "result = Parallel(n_jobs=-1)(delayed(process_item)(item) for item in tqdm.tqdm(data.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, k in enumerate(data.keys()):\n",
    "    df = dataframes[k]\n",
    "    diffuse_fraction = result[i]\n",
    "    df['radiation_direct_horizontal'] = ((1 - diffuse_fraction) * df['irradiance_surface']).fillna(0)\n",
    "    df['radiation_diffuse_horizontal'] = (diffuse_fraction * df['irradiance_surface']).fillna(0)\n",
    "    dataframes[k] = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "variables = ['windspeed_10m', 'temperature', 'radiation_direct_horizontal', 'radiation_diffuse_horizontal']\n",
    "\n",
    "for k in dataframes.keys():\n",
    "    dataframes[k] = dataframes[k].loc[:, [v for v in variables if v in dataframes[k].columns]]\n",
    "\n",
    "if len(dataframes) > 0:\n",
    "    \n",
    "    complete_data = pd.concat(dataframes, axis=1, join='inner')\n",
    "\n",
    "    df = pd.concat([complete_data, df_cached], axis=1)\n",
    "\n",
    "    df.columns = pd.MultiIndex.from_tuples(\n",
    "        [(i[0], i[1]) for i in df.columns],\n",
    "        names=['geography', 'variable']\n",
    "    )\n",
    "\n",
    "    df.index.name = 'utc_timestamp'\n",
    "\n",
    "    df.to_csv('cached_dataframe.csv')\n",
    "    \n",
    "else:\n",
    "    \n",
    "    df = df_cached"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "fa919796-a7f6-4556-aeed-181ddc6028ac"
    }
   },
   "source": [
    "# Write data to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(version, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "47c1efa2-d93f-4d13-81d7-8f64dadeff3f"
    }
   },
   "source": [
    "## Reshape data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "a753ac43-a0f4-44bc-a89d-1ccaaf48289a"
    }
   },
   "source": [
    "Data are provided in two different \"shapes\": \n",
    "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n",
    "  - File formats: CSV, SQLite\n",
    "- MultiIndex (easy to read into GAMS, not compatible with datapackage standard, small file size)\n",
    "  - File formats: CSV, Excel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "769225c6-31f5-4db8-8d91-32a3f983489c"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_multiindex = df\n",
    "\n",
    "df_singleindex = df.copy()\n",
    "\n",
    "df_singleindex.columns = [\n",
    "    '_'.join([level for level in list(col)])\n",
    "    for col in df.columns.values\n",
    "]\n",
    "\n",
    "df_stacked = df.copy()\n",
    "df_stacked = df_stacked.transpose().stack(dropna=True).to_frame(name='data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "datetime_format = '%Y-%m-%dT%H:%M:%SZ'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "84f1822e-3aa6-42c4-a424-5dc5ab6fa56f"
    }
   },
   "source": [
    "## Write to SQLite database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "fd35212c-ec5c-4fcf-9897-4608742d1bf8"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# SQLite is required for the filtering function on the OPSD website\n",
    "\n",
    "df_sqlite = df_singleindex.copy()\n",
    "df_sqlite.index = df_sqlite.index.strftime(datetime_format)\n",
    "filepath = os.path.join(version, 'weather_data.sqlite')\n",
    "\n",
    "if os.path.exists(filepath):\n",
    "    os.unlink(filepath)\n",
    "\n",
    "df_sqlite.to_sql(\n",
    "    'weather_data',\n",
    "    sqlite3.connect(filepath),\n",
    "    if_exists='replace',\n",
    "    index_label='utc_timestamp'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "454ee5f5-e8f1-4088-94e9-e846f48ee75b"
    }
   },
   "source": [
    "## Write to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "89449c49-608d-488d-8bc8-077c64bc26c7"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "csv_kwargs = dict(\n",
    "    float_format='%.4f',\n",
    "    date_format=datetime_format\n",
    ")\n",
    "\n",
    "df_singleindex.to_csv(\n",
    "    os.path.join(version, 'weather_data.csv'),\n",
    "    **csv_kwargs\n",
    ")\n",
    "\n",
    "df_multiindex.to_csv(\n",
    "    os.path.join(version, 'weather_data_multiindex.csv'),\n",
    "    **csv_kwargs\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See generate_metadata.py for details\n",
    "generate_metadata.generate_json(df_multiindex, version, changes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write checksums.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We publish SHA checksums for the output files on GitHub to allow verifying their integrity on the OPSD server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sha_hash(path, blocksize=65536):\n",
    "    sha_hasher = hashlib.sha256()\n",
    "    with open(path, 'rb') as f:\n",
    "        buffer = f.read(blocksize)\n",
    "        while len(buffer) > 0:\n",
    "            sha_hasher.update(buffer)\n",
    "            buffer = f.read(blocksize)\n",
    "        return sha_hasher.hexdigest()\n",
    "\n",
    "\n",
    "checksum_file_path = os.path.join(version, 'checksums.txt')\n",
    "files = glob.glob(os.path.join(version, 'weather_data*'))\n",
    "\n",
    "# Create checksums.txt in the version directory\n",
    "with open(checksum_file_path, 'w') as f:\n",
    "    for this_file in files:\n",
    "        file_hash = get_sha_hash(this_file)\n",
    "        f.write('{},{}\\n'.format(os.path.basename(this_file), file_hash))\n",
    "\n",
    "# Copy the file to root directory from where it will be pushed to GitHub\n",
    "shutil.copyfile(checksum_file_path, 'checksums.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "celltoolbar": "Initialisation Cell",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "nbpresent": {
   "slides": {
    "f6b300bf-88b5-4dea-951e-c926a9ea8287": {
     "id": "f6b300bf-88b5-4dea-951e-c926a9ea8287",
     "prev": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "regions": {
      "dc486e18-7547-4610-99c0-55dfb5553f62": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71",
        "part": "source"
       },
       "id": "dc486e18-7547-4610-99c0-55dfb5553f62"
      }
     }
    },
    "f96dd4bc-93a6-4014-b85f-a43061cf5688": {
     "id": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "prev": null,
     "regions": {
      "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "1562965a-7d74-4c1c-8251-4d82847f294a",
        "part": "source"
       },
       "id": "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e"
      }
     }
    }
   },
   "themes": {}
  },
  "notify_time": "10",
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "156px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}