{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "fdd7071a-0bcf-48c5-9ed7-772ab69e59df"
    }
   },
   "source": [
    "<table style=\"width:100%; background-color: #EBF5FB\">\n",
    "  <tr>\n",
    "    <td style=\"border: 1px solid #CFCFCF\">\n",
    "      <b>Time series: Processing Notebook</b>\n",
    "      <ul>\n",
    "        <li><a href=\"main.ipynb\">Main Notebook</a></li>\n",
    "        <li>Processing Notebook</li>\n",
    "      </ul>\n",
    "      <br>This Notebook is part of the <a href=\"http://data.open-power-system-data.org/time_series\">Time series Data Package</a> of <a href=\"http://open-power-system-data.org\">Open Power System Data</a>.\n",
    "    </td>\n",
    "  </tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": "true"
   },
   "source": [
    "# Table of Contents\n",
    " <p><div class=\"lev1 toc-item\"><a href=\"#Introductory-Notes\" data-toc-modified-id=\"Introductory-Notes-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Introductory Notes</a></div><div class=\"lev1 toc-item\"><a href=\"#Settings\" data-toc-modified-id=\"Settings-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Settings</a></div><div class=\"lev2 toc-item\"><a href=\"#Set-version-number-and-recent-changes\" data-toc-modified-id=\"Set-version-number-and-recent-changes-21\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>Set version number and recent changes</a></div><div class=\"lev2 toc-item\"><a href=\"#Import-Python-libraries\" data-toc-modified-id=\"Import-Python-libraries-22\"><span class=\"toc-item-num\">2.2&nbsp;&nbsp;</span>Import Python libraries</a></div><div class=\"lev2 toc-item\"><a href=\"#Set-directories\" data-toc-modified-id=\"Set-directories-23\"><span class=\"toc-item-num\">2.3&nbsp;&nbsp;</span>Set directories</a></div><div class=\"lev2 toc-item\"><a href=\"#Set-up-a-log\" data-toc-modified-id=\"Set-up-a-log-24\"><span class=\"toc-item-num\">2.4&nbsp;&nbsp;</span>Set up a log</a></div><div class=\"lev2 toc-item\"><a href=\"#Select-timerange\" data-toc-modified-id=\"Select-timerange-25\"><span class=\"toc-item-num\">2.5&nbsp;&nbsp;</span>Select timerange</a></div><div class=\"lev2 toc-item\"><a href=\"#Select-download-source\" data-toc-modified-id=\"Select-download-source-26\"><span class=\"toc-item-num\">2.6&nbsp;&nbsp;</span>Select download source</a></div><div class=\"lev2 toc-item\"><a href=\"#Select-subset\" data-toc-modified-id=\"Select-subset-27\"><span class=\"toc-item-num\">2.7&nbsp;&nbsp;</span>Select subset</a></div><div class=\"lev1 toc-item\"><a href=\"#Download\" data-toc-modified-id=\"Download-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Download</a></div><div class=\"lev2 toc-item\"><a href=\"#Automatic-download-(for-all-sources-except-Energinet.dk)\" data-toc-modified-id=\"Automatic-download-(for-all-sources-except-Energinet.dk)-31\"><span class=\"toc-item-num\">3.1&nbsp;&nbsp;</span>Automatic download (for all sources except Energinet.dk)</a></div><div class=\"lev2 toc-item\"><a href=\"#Manual-download-(Energinet.dk)\" data-toc-modified-id=\"Manual-download-(Energinet.dk)-32\"><span class=\"toc-item-num\">3.2&nbsp;&nbsp;</span>Manual download (Energinet.dk)</a></div><div class=\"lev1 toc-item\"><a href=\"#Read\" data-toc-modified-id=\"Read-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Read</a></div><div class=\"lev2 toc-item\"><a href=\"#Preparations\" data-toc-modified-id=\"Preparations-41\"><span class=\"toc-item-num\">4.1&nbsp;&nbsp;</span>Preparations</a></div><div class=\"lev2 toc-item\"><a href=\"#Reading-loop\" data-toc-modified-id=\"Reading-loop-42\"><span class=\"toc-item-num\">4.2&nbsp;&nbsp;</span>Reading loop</a></div><div class=\"lev2 toc-item\"><a href=\"#Save-raw-data\" data-toc-modified-id=\"Save-raw-data-43\"><span class=\"toc-item-num\">4.3&nbsp;&nbsp;</span>Save raw data</a></div><div class=\"lev1 toc-item\"><a href=\"#Processing\" data-toc-modified-id=\"Processing-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Processing</a></div><div class=\"lev2 toc-item\"><a href=\"#Missing-data-handling\" data-toc-modified-id=\"Missing-data-handling-51\"><span class=\"toc-item-num\">5.1&nbsp;&nbsp;</span>Missing data handling</a></div><div class=\"lev3 toc-item\"><a href=\"#Interpolation\" data-toc-modified-id=\"Interpolation-511\"><span class=\"toc-item-num\">5.1.1&nbsp;&nbsp;</span>Interpolation</a></div><div class=\"lev2 toc-item\"><a href=\"#Country-specific-calculations\" data-toc-modified-id=\"Country-specific-calculations-52\"><span class=\"toc-item-num\">5.2&nbsp;&nbsp;</span>Country specific calculations</a></div><div class=\"lev3 toc-item\"><a href=\"#Calculate-onshore-wind-generation-for-German-TSOs\" data-toc-modified-id=\"Calculate-onshore-wind-generation-for-German-TSOs-521\"><span class=\"toc-item-num\">5.2.1&nbsp;&nbsp;</span>Calculate onshore wind generation for German TSOs</a></div><div class=\"lev3 toc-item\"><a href=\"#Calculate-aggregate-wind-capacity-for-germany-(on-+-offshore)\" data-toc-modified-id=\"Calculate-aggregate-wind-capacity-for-germany-(on-+-offshore)-522\"><span class=\"toc-item-num\">5.2.2&nbsp;&nbsp;</span>Calculate aggregate wind capacity for germany (on + offshore)</a></div><div class=\"lev3 toc-item\"><a href=\"#Aggregate-German-data-from-individual-TSOs-and-calculate-availabilities/profiles\" data-toc-modified-id=\"Aggregate-German-data-from-individual-TSOs-and-calculate-availabilities/profiles-523\"><span class=\"toc-item-num\">5.2.3&nbsp;&nbsp;</span>Aggregate German data from individual TSOs and calculate availabilities/profiles</a></div><div class=\"lev2 toc-item\"><a href=\"#Create-hourly-data-from-15'-data\" data-toc-modified-id=\"Create-hourly-data-from-15'-data-53\"><span class=\"toc-item-num\">5.3&nbsp;&nbsp;</span>Create hourly data from 15' data</a></div><div class=\"lev2 toc-item\"><a href=\"#Fill-columns-not-retrieved-directly-from-TSO-webites-with--ENTSO-E-Transparency-data\" data-toc-modified-id=\"Fill-columns-not-retrieved-directly-from-TSO-webites-with--ENTSO-E-Transparency-data-54\"><span class=\"toc-item-num\">5.4&nbsp;&nbsp;</span>Fill columns not retrieved directly from TSO webites with  ENTSO-E Transparency data</a></div><div class=\"lev2 toc-item\"><a href=\"#Insert-a-column-with-Central-European-(Summer-)time\" data-toc-modified-id=\"Insert-a-column-with-Central-European-(Summer-)time-55\"><span class=\"toc-item-num\">5.5&nbsp;&nbsp;</span>Insert a column with Central European (Summer-)time</a></div><div class=\"lev1 toc-item\"><a href=\"#Create-metadata\" data-toc-modified-id=\"Create-metadata-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Create metadata</a></div><div class=\"lev1 toc-item\"><a href=\"#Write-data-to-disk\" data-toc-modified-id=\"Write-data-to-disk-7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Write data to disk</a></div><div class=\"lev2 toc-item\"><a href=\"#Limit-time-range\" data-toc-modified-id=\"Limit-time-range-71\"><span class=\"toc-item-num\">7.1&nbsp;&nbsp;</span>Limit time range</a></div><div class=\"lev2 toc-item\"><a href=\"#Different-shapes\" data-toc-modified-id=\"Different-shapes-72\"><span class=\"toc-item-num\">7.2&nbsp;&nbsp;</span>Different shapes</a></div><div class=\"lev2 toc-item\"><a href=\"#Write-to-SQL-database\" data-toc-modified-id=\"Write-to-SQL-database-73\"><span class=\"toc-item-num\">7.3&nbsp;&nbsp;</span>Write to SQL-database</a></div><div class=\"lev2 toc-item\"><a href=\"#Write-to-Excel\" data-toc-modified-id=\"Write-to-Excel-74\"><span class=\"toc-item-num\">7.4&nbsp;&nbsp;</span>Write to Excel</a></div><div class=\"lev2 toc-item\"><a href=\"#Write-to-CSV\" data-toc-modified-id=\"Write-to-CSV-75\"><span class=\"toc-item-num\">7.5&nbsp;&nbsp;</span>Write to CSV</a></div><div class=\"lev2 toc-item\"><a href=\"#Write-checksums.txt\" data-toc-modified-id=\"Write-checksums.txt-76\"><span class=\"toc-item-num\">7.6&nbsp;&nbsp;</span>Write checksums.txt</a></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "bba7a58e-f3b2-4d3b-9617-d734c369084f"
    }
   },
   "source": [
    "# Introductory Notes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This Notebook handles missing data, performs calculations and aggragations and creates the output files."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "ba6b62da-6cee-476b-a563-c945f3fd0f79"
    }
   },
   "source": [
    "# Settings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set version number and recent changes\n",
    "Executing this script till the end will create a new version of the data package.\n",
    "The Version number specifies the local directory for the data <br>\n",
    "We include a note on what has been changed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:52.131704Z",
     "start_time": "2017-07-09T19:48:52.123694Z"
    },
    "collapsed": true,
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "version = '2017-07-09'\n",
    "changes = '''include ENTSO E transparency, RTE, APG'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "2b838df4-f987-4ae4-a132-9c898e3ffab1"
    }
   },
   "source": [
    "## Import Python libraries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d4e82608-1ad1-4c51-8285-929a5a92c5b6"
    }
   },
   "source": [
    "This section: load libraries and set up a log.\n",
    "\n",
    "Note that the download module makes use of the [pycountry](https://pypi.python.org/pypi/pycountry) library that is not part of Anaconda. Install it with with `pip install pycountry` from the command line."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.087760Z",
     "start_time": "2017-07-09T19:48:52.136706Z"
    },
    "collapsed": true,
    "init_cell": true,
    "nbpresent": {
     "id": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Python modules\n",
    "from datetime import datetime, date, timedelta, time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import logging\n",
    "import json\n",
    "import sqlite3\n",
    "import yaml\n",
    "import itertools\n",
    "import os\n",
    "import pytz\n",
    "import hashlib\n",
    "from shutil import copyfile\n",
    "\n",
    "# Skripts from time-series repository\n",
    "from timeseries_scripts.read import read\n",
    "from timeseries_scripts.download import download\n",
    "from timeseries_scripts.imputation import find_nan\n",
    "from timeseries_scripts.imputation import resample_markers\n",
    "from timeseries_scripts.make_json import make_json\n",
    "\n",
    "# Reload modules with execution of any code, to avoid having to restart\n",
    "# the kernel after editing timeseries_scripts\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.277900Z",
     "start_time": "2017-07-09T19:48:56.090762Z"
    },
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "# make sure the working directory is this file's directory\n",
    "try:\n",
    "    os.chdir(home_path)\n",
    "except NameError:\n",
    "    home_path = os.path.realpath('.')\n",
    "\n",
    "sources_yaml_path = os.path.join(home_path, 'input', 'sources.yml')\n",
    "areas_csv_path = os.path.join(home_path, 'input', 'areas.csv')\n",
    "data_path = os.path.join('E:', os.sep, 'time series', version, 'original_data')\n",
    "out_path = os.path.join('E:', os.sep, 'time series', version)\n",
    "temp_path = os.path.join('E:', os.sep, 'time series', 'temp')\n",
    "os.makedirs(out_path, exist_ok=True)\n",
    "os.makedirs(temp_path, exist_ok=True)\n",
    "# change to temp directory\n",
    "os.chdir(temp_path)\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-01T21:51:29.732865Z",
     "start_time": "2017-07-01T21:51:29.567747Z"
    }
   },
   "source": [
    "## Set up a log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:10:00.357871Z",
     "start_time": "2017-07-09T19:10:00.197757Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "FORMAT = '%(asctime)s %(levelname)s %(message)s'\n",
    "DATEFORMAT = '%Y-%m-%d %H:%M:%S'\n",
    "formatter = logging.Formatter(fmt=FORMAT, datefmt=DATEFORMAT)\n",
    "logging.basicConfig(level=logging.INFO,\n",
    "                    format=FORMAT,\n",
    "                    datefmt=DATEFORMAT)\n",
    "\n",
    "logfile = logging.FileHandler('log1.log')\n",
    "logfile.setFormatter(formatter)\n",
    "logfile.setLevel(logging.INFO)\n",
    "logger = logging.getLogger(__name__)\n",
    "logger.addHandler(logfile)\n",
    "\n",
    "logger.setLevel(logging.INFO)\n",
    "# For more detailed logging messages, replace 'INFO' with 'DEBUG'\n",
    "# (May slow down computation)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "8ed60fdb-284d-43db-b802-8e5c405b8e84"
    }
   },
   "source": [
    "## Select timerange"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section: select the time range and the data sources for download and read. Default: all data sources implemented, full time range available.\n",
    "\n",
    "**Source parameters** are specified in [input/sources.yml](input/sources.yml), which describes, for each source, the datasets (such as wind and solar generation) alongside all the parameters necessary to execute the downloads.\n",
    "\n",
    "The option to perform downloading and reading of subsets is for testing only. To be able to run the script succesfully until the end, all sources have to be included, or otherwise the script will run into errors (i.e. the step where aggregate German timeseries are caculated requires data from all four German TSOs to be loaded)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to do this, specify the beginning and end of the interval for which to attempt the download.\n",
    "\n",
    "Type `None` to download all available data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.468399Z",
     "start_time": "2017-07-09T19:48:56.288914Z"
    },
    "collapsed": true,
    "init_cell": true,
    "nbpresent": {
     "id": "f3008d47-ec89-40d0-85ee-776d110f3bb4"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "start_from_user = date(2006, 1, 1)\n",
    "end_from_user = date(2017, 5, 31)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select download source"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instead of downloading from the sources, the complete raw data can be downloaded as a zip file from the OPSD Server. Advantages are:\n",
    "- much faster download\n",
    "- back up of raw data in case it is deleted from the server at the original source\n",
    "\n",
    "In order to do this, specify an archive version to use the raw data from that version that has been cached on the OPSD server as input. All data from that version will be downloaded - timerange and subset will be ignored.\n",
    "\n",
    "Type `None` to download directly from the original sources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.624511Z",
     "start_time": "2017-07-09T19:48:56.471401Z"
    },
    "collapsed": true,
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "archive_version = None  # i.e. '2016-07-14'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, specify a subset to download/read.<br>\n",
    "The next cell prints the available sources and datasets.<br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T00:32:14.642791Z",
     "start_time": "2017-07-09T00:32:14.316106Z"
    },
    "collapsed": true,
    "format": "row",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with open(sources_yaml_path, 'r') as f:\n",
    "    sources = yaml.load(f.read())\n",
    "for k, v in sources.items():\n",
    "    print(yaml.dump({k: list(v.keys())}, default_flow_style=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copy from its output and paste to following cell to get the right format.<br>\n",
    "Type `subset = None` to include all data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.800635Z",
     "start_time": "2017-07-09T19:48:56.629514Z"
    },
    "collapsed": true,
    "init_cell": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "subset = yaml.load('''\n",
    "insert_source_here:\n",
    "- insert_dataset1_from_that_source_here\n",
    "- insert_dataset2_here\n",
    "more_sources:\n",
    "- more_data_sets\n",
    "''')\n",
    "\n",
    "# need to exclude Elia data due to unclear copyright situation\n",
    "exclude = yaml.load('''\n",
    "- Elia\n",
    "''')  \n",
    "subset = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:56.993228Z",
     "start_time": "2017-07-09T19:48:56.805639Z"
    },
    "collapsed": true,
    "init_cell": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "subset = yaml.load('''\n",
    "PSE:\n",
    "- wind\n",
    "''')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now eliminate sources and variables not in subset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:57.263419Z",
     "start_time": "2017-07-09T19:48:56.998234Z"
    },
    "collapsed": true,
    "init_cell": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with open(sources_yaml_path, 'r') as f:\n",
    "    sources = yaml.load(f.read())\n",
    "if subset:  # eliminate sources and variables not in subset\n",
    "    sources = {source_name: {k: v\n",
    "                             for k, v in sources[source_name].items()\n",
    "                             if k in variable_list}\n",
    "               for source_name, variable_list in subset.items()}\n",
    "if exclude:  # eliminate sources and variables in exclude\n",
    "    sources = {source_name: variable_dict\n",
    "               for source_name, variable_dict in sources.items()\n",
    "               if not source_name in exclude}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section: download data. Takes about 1 hour to run for the complete data set (`subset=None`).\n",
    "\n",
    "First, a data directory is created on your local computer. Then, download parameters for each data source are defined, including the URL. These parameters are then turned into a YAML-string. Finally, the download is executed file by file.\n",
    "\n",
    "Each file is saved under it's original filename. Note that the original file names are often not self-explanatory (called \"data\" or \"January\"). The files content is revealed by its place in the directory structure."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "e08368cb-021f-453a-b201-a8d48bc8e4c4"
    }
   },
   "source": [
    "## Automatic download (for all sources except Energinet.dk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:42:01.152823Z",
     "start_time": "2017-07-09T19:41:48.442656Z"
    },
    "collapsed": true,
    "format": "row",
    "nbpresent": {
     "id": "0c1eb987-6d5f-4e3d-9248-df80b9f37a49"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "download(sources, data_path,\n",
    "         archive_version=archive_version,\n",
    "         start_from_user=start_from_user,\n",
    "         end_from_user=end_from_user)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "bae7810a-d4af-4021-a9ab-c4b772a2bd53"
    }
   },
   "source": [
    "## Manual download (Energinet.dk)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "a69f8271-e1fc-4e64-814c-4210fb80c006"
    }
   },
   "source": [
    "[Energinet.dk](https://en.energinet.dk/Electricity/Energy-data) data needs to be downloaded manually [here](http://osp.energinet.dk/_layouts/Markedsdata/framework/integrations/markedsdatatemplate.aspx). The website seems as of 2017-07 to be only available in Danish.\n",
    "\n",
    "\n",
    "Check The Boxes as specified below, then press the \"Get extract\"-button at the end of the page.\n",
    "You will receive a file `Market Data.xls` of about 50 MB. Open the file in Excel. There will be a warning from Excel saying that file extension and content are in conflict. Select \"open anyways\" and and save the file as `.xlsx`. This will compress the file by ~80%, resulting in faster processing afterwards. \n",
    "\n",
    "In order to be found by the read-function, place the downloaded file in the following subdirectory:<br>\n",
    "` \\original_data\\Energinet.dk\\prices_wind_solar\\2005-01-01_2017-12-31\\ ` <br>\n",
    "\n",
    "**Boxes to check:**\n",
    "\n",
    "- Period\n",
    "    - Get data from\n",
    "        - 01-01-2005\n",
    "    - To\n",
    "        - Today\n",
    "    - all months\n",
    "- Data columns\n",
    "    - Elspot Price, Currency Code/MWh\n",
    "        - DK-West\n",
    "        - DK-East\n",
    "        - Norway\n",
    "        - Sweden (SE)\n",
    "        - Sweden (SE3)\n",
    "        - Sweden (SE4)\n",
    "        - DE European Power Exchange\n",
    "    - Production and consumption, MWh/h\n",
    "        - DK-West: Wind power production\n",
    "        - DK-West: Solar cell production (estimated)\n",
    "        - DK-East: Wind power production\n",
    "        - DK-East: Solar cell production (estimated)\n",
    "        - DK: Wind power production (onshore)\n",
    "        - DK: Wind power production (offshore)\n",
    "- Data format\n",
    "    - Currency code\n",
    "        - EUR\n",
    "    - Decimal format\n",
    "        - English number Format (period as decimal separator)\n",
    "    - Date format\n",
    "        - Other date format(YYYY-MM-DD)\n",
    "    - Recieve to\n",
    "        - Excel\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d0b353e7-179d-4556-bdd2-270192c830fb"
    }
   },
   "source": [
    "# Read"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d223207e-2b0e-49ec-8bf6-af2969ee5b28"
    }
   },
   "source": [
    "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes ~15 minutes to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "1300cbb2-efde-4844-b08c-fed092023e38"
    }
   },
   "source": [
    "Set the title of the rows at the top of the data used to store metadata internally. The order of this list determines the order of the levels in the resulting output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:57.375497Z",
     "start_time": "2017-07-09T19:48:57.266421Z"
    },
    "collapsed": true,
    "format": "row",
    "init_cell": true,
    "nbpresent": {
     "id": "4dc92cc3-c01d-4c83-9252-80958edbe0f9"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "headers = ['region', 'variable', 'attribute', 'source', 'web', 'unit']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read a prepared table containing meta data on the geographical areas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:14:07.805299Z",
     "start_time": "2017-07-09T19:14:07.667325Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "areas = pd.read_csv(areas_csv_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View the areas table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "areas.loc[areas['OPSD area'].notnull(),:'EIC'].fillna('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading loop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loop through sources and variables to do the reading.\n",
    "First read the originla CSV, Excel etc. files into pandas DataFrames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:45:49.732239Z",
     "start_time": "2017-07-09T19:45:40.642822Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "areas = pd.read_csv(areas_csv_path)\n",
    "\n",
    "# For each source in the source dictionary\n",
    "for source_name, source_dict in sources.items():\n",
    "    # For each variable from source_name\n",
    "    for variable_name, param_dict in source_dict.items():\n",
    "#        variable_dir = os.path.join(data_path, source_name, variable_name)\n",
    "        res_list = param_dict['resolution']\n",
    "        url = param_dict['web']\n",
    "        for res_key in res_list:\n",
    "            df = read(data_path, areas, source_name, variable_name,\n",
    "                      url, res_key, headers,\n",
    "                      start_from_user=start_from_user,\n",
    "                      end_from_user=end_from_user)\n",
    "\n",
    "            os.makedirs(res_key, exist_ok=True)\n",
    "            filename = '_'.join([source_name, variable_name]) + '.pickle'\n",
    "            df.to_pickle(os.path.join(res_key, filename))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then combine the DataFrames that have the same temporal resolution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:08:14.883502Z",
     "start_time": "2017-07-09T02:07:56.920886Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Create a dictionary of empty DataFrames to be populated with data\n",
    "data_sets = {'15min': pd.DataFrame(), '30min': pd.DataFrame(), '60min': pd.DataFrame()}\n",
    "entso_e = {'15min': pd.DataFrame(), '30min': pd.DataFrame(), '60min': pd.DataFrame()}\n",
    "for res_key in data_sets.keys():\n",
    "    for filename in os.listdir(res_key):\n",
    "        source_name = filename.split('_')[0]\n",
    "        if subset and not source_name in subset.keys():\n",
    "            continue\n",
    "        logger.info(filename)\n",
    "        df_portion = pd.read_pickle(os.path.join(res_key, filename))\n",
    "\n",
    "        if source_name == 'ENTSO-E Transparency FTP':\n",
    "            dfs = entso_e\n",
    "        else:\n",
    "            dfs = data_sets\n",
    "\n",
    "        if dfs[res_key].empty:\n",
    "            dfs[res_key] = df_portion\n",
    "        elif not df_portion.empty:\n",
    "            dfs[res_key] = dfs[res_key].combine_first(df_portion)\n",
    "        else:\n",
    "            logger.warning(filename + 'was empty')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-08T18:43:15.052777Z",
     "start_time": "2017-07-08T18:43:14.920689Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    logger.info(res_key + ': %s', df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T00:50:41.751553Z",
     "start_time": "2017-07-09T00:50:41.636472Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in entso_e.items():\n",
    "    logger.info(res_key + ': %s', df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "be24bc5f-e281-43bf-853a-e7130d20d2e8"
    }
   },
   "source": [
    "Display some rows of the dataframes to get a first impression of the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T00:52:41.269380Z",
     "start_time": "2017-07-09T00:52:41.041221Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_sets['60min'].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save raw data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "019c8ae9-8934-4074-adfd-278acd212152"
    }
   },
   "source": [
    "Save the DataFrames created by the read function to disk. This way you have the raw data to fall back to if something goes wrong in the ramainder of this notebook without having to repeat the previos steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:08:41.167658Z",
     "start_time": "2017-07-09T02:08:39.913727Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets['15min'].to_pickle('raw_data_15.pickle')\n",
    "data_sets['30min'].to_pickle('raw_data_30.pickle')\n",
    "data_sets['60min'].to_pickle('raw_data_60.pickle')\n",
    "entso_e['15min'].to_pickle('raw_entso_e_15.pickle')\n",
    "entso_e['30min'].to_pickle('raw_entso_e_30.pickle')\n",
    "entso_e['60min'].to_pickle('raw_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the DataFrames saved above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T01:01:42.171699Z",
     "start_time": "2017-07-09T01:01:41.141505Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('raw_data_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('raw_data_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('raw_data_60.pickle')\n",
    "entso_e['15min'] = pd.read_pickle('raw_entso_e_15.pickle')\n",
    "entso_e['30min'] = pd.read_pickle('raw_entso_e_30.pickle')\n",
    "entso_e['60min'] = pd.read_pickle('raw_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "026c6ad8-0422-4887-9361-8e45ae33e0c6"
    }
   },
   "source": [
    "# Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "c446157a-7872-491b-a9c4-0336b87568a6"
    }
   },
   "source": [
    "This section: missing data handling, aggregation of sub-national to national data, aggregate 15'-data to 60'-resolution. Takes 30 minutes to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "0c13e5eb-d8e3-425d-982e-2a25c364cdc5"
    }
   },
   "source": [
    "## Missing data handling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Interpolation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "81d4d772-d3c7-4e07-a0a7-49afb3bdf8f6"
    }
   },
   "source": [
    "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. This catched most of the missing data due to daylight savings time transitions, while leaving bigger gaps untouched\n",
    "\n",
    "The exact locations of missing data are stored in the `nan_table` DataFrames.\n",
    "\n",
    "Where data has been interpolated, it is marked in a new column `comment`. For eaxample the comment `solar_DE-transnetbw_generation;` means that in the original data, there is a gap in the solar generation timeseries from TransnetBW in the time period where the marker appears."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "533cd53f-eb8f-4699-bbf7-72ad1cf90325"
    }
   },
   "source": [
    "Patch the datasets and display the location of missing Data in the original data. Takes ~5 minutes to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:11:04.264138Z",
     "start_time": "2017-07-09T02:08:49.649554Z"
    },
    "collapsed": true,
    "format": "row",
    "nbpresent": {
     "id": "33602355-28cb-4d5c-b97e-fe59b9b48883"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "nan_tables = {}\n",
    "for res_key, df in data_sets.items():\n",
    "    data_sets[res_key], nan_tables[res_key] = find_nan(\n",
    "        df, res_key, headers, patch=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:12:58.589866Z",
     "start_time": "2017-07-09T02:11:04.272143Z"
    },
    "collapsed": true,
    "format": "row",
    "nbpresent": {
     "id": "33602355-28cb-4d5c-b97e-fe59b9b48883"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in entso_e.items():\n",
    "    entso_e[res_key], nan_tables[res_key + ' ENTSO-E'] = find_nan(\n",
    "        df, res_key, headers, patch=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "345c31ec-fa0b-4859-a930-8b7c20f1e1bf"
    }
   },
   "source": [
    "Execute this to see an example of where the data has been patched."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:16:44.142786Z",
     "start_time": "2017-07-09T02:16:42.776818Z"
    },
    "collapsed": true,
    "format": "row",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_sets['15min'][data_sets['15min']['interpolated_values'].notnull()].tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4c5c4893-5098-47df-8e68-51b13f5fd484"
    }
   },
   "source": [
    "Display the table of regions of missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:16:56.054708Z",
     "start_time": "2017-07-09T02:16:55.450366Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "b5207ca6-4a72-40b6-8afd-3123f66ae323"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "nan_tables['60min']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can export the NaN-tables to Excel in order to inspect where there are NaNs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:18:07.788379Z",
     "start_time": "2017-07-09T02:17:06.188757Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "writer = pd.ExcelWriter('NaN_table.xlsx')\n",
    "for res_key, df in nan_tables.items():\n",
    "    df.to_excel(writer, res_key)\n",
    "writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save/Load the patched data sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:18:11.220848Z",
     "start_time": "2017-07-09T02:18:07.794383Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets['15min'].to_pickle('patched_15.pickle')\n",
    "data_sets['30min'].to_pickle('patched_30.pickle')\n",
    "data_sets['60min'].to_pickle('patched_60.pickle')\n",
    "entso_e['15min'].to_pickle('patched_entso_e_15.pickle')\n",
    "entso_e['30min'].to_pickle('patched_entso_e_30.pickle')\n",
    "entso_e['60min'].to_pickle('patched_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('patched_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('patched_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('patched_60.pickle')\n",
    "entso_e['15min'] = pd.read_pickle('patched_entso_e_15.pickle')\n",
    "entso_e['30min'] = pd.read_pickle('patched_entso_e_30.pickle')\n",
    "entso_e['60min'] = pd.read_pickle('patched_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "abd61a95-68e4-4bf2-9f1f-ef010b4cd288"
    }
   },
   "source": [
    "## Country specific calculations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "0a2cc638-d6f5-4f7f-a044-fb61d20cf4ad"
    }
   },
   "source": [
    "### Calculate onshore wind generation for German TSOs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For 50 Hertz, it is already in the data.\n",
    "For TenneT, it is calculated by substracting offshore from total generation.\n",
    "For Amprion and TransnetBW, onshore wind generation is just total wind generation.\n",
    "Takes <1 second to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:18:11.949365Z",
     "start_time": "2017-07-09T02:18:11.228853Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Some of the following operations require the Dataframes to be lexsorted in\n",
    "# the columns\n",
    "for res_key, df in data_sets.items():\n",
    "    df.sort_index(axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:19:15.814664Z",
     "start_time": "2017-07-09T02:19:14.342037Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for area, source in zip(['DE_amprion', 'DE_tennet', 'DE_transnetbw'],\n",
    "                        ['Amprion', 'TenneT', 'TransnetBW']):\n",
    "    new_col_header = {\n",
    "        'variable': 'wind_onshore',\n",
    "        'region': '{tso}',\n",
    "        'attribute': 'generation',\n",
    "        'source': 'own calculation',\n",
    "        'web': '',\n",
    "        'unit': 'MW'\n",
    "    }\n",
    "\n",
    "    if area == 'DE_tennet':\n",
    "        colname = ('DE_tennet', 'wind_offshore', 'generation', 'TenneT')\n",
    "        offshore = data_sets['15min'].loc[:, colname]\n",
    "    else:\n",
    "        offshore = 0\n",
    "\n",
    "    data_sets['15min'][\n",
    "        tuple(new_col_header[level].format(tso=area) for level in headers)\n",
    "    ] = (data_sets['15min'][(area, 'wind', 'generation', source)] - offshore)\n",
    "\n",
    "    # Sort again\n",
    "    data_sets['15min'].sort_index(axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate aggregate wind capacity for germany (on + offshore)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apart from being interesting on it's own, this is also required to calculate an aggregated wind-profile for Germany"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:19:18.688249Z",
     "start_time": "2017-07-09T02:19:18.162865Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "new_col_header = {\n",
    "    'variable': 'wind',\n",
    "    'region': 'DE',\n",
    "    'attribute': 'capacity',\n",
    "    'source': 'BNetzA and Netztransparenz.de',\n",
    "    'web': 'http://data.open-power-system-data.org/renewable_power_plants',\n",
    "    'unit': 'MW'\n",
    "}\n",
    "new_col_header = tuple(new_col_header[level] for level in headers)\n",
    "\n",
    "data_sets['15min'][new_col_header] = (\n",
    "    data_sets['15min']\n",
    "    .loc[:, ('DE', ['wind_onshore', 'wind_offshore'], 'capacity')]\n",
    "    .sum(axis=1, skipna=False))\n",
    "\n",
    "# Sort again\n",
    "data_sets['15min'].sort_index(axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "cbcd1b7c-6c9c-4d1a-ab70-28276b421805"
    }
   },
   "source": [
    "### Aggregate German data from individual TSOs and calculate availabilities/profiles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "9ab0116d-402b-444b-acf2-925388a9886b"
    }
   },
   "source": [
    "The wind and solar in-feed data for the 4 German balancing areas is summed up and stored in in new columns, which are then used to calculate profiles, that is, the share of wind/solar capacity producing at a given time. The column headers are created in the fashion introduced in the read script. Takes 5 seconds to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:19:34.406323Z",
     "start_time": "2017-07-09T02:19:25.631537Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "26a83b1c-682d-43ec-bb70-12a096089508"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for tech in ['solar', 'wind', 'wind_onshore', 'wind_offshore']:\n",
    "    for attribute in ['generation']:  # we could also include 'forecast'\n",
    "        sum_col = data_sets['15min'].loc(axis=1)[(\n",
    "            ['DE_50hertz', 'DE_amprion', 'DE_tennet', 'DE_transnetbw'],\n",
    "            tech, attribute)].sum(axis=1, skipna=False).to_frame()\n",
    "\n",
    "        # Create a new MultiIndex and append aggregated data to the dataset\n",
    "        new_col_header = {\n",
    "            'variable': '{tech}',\n",
    "            'region': 'DE',\n",
    "            'attribute': '{attribute}',\n",
    "            'source': 'own calculation',\n",
    "            'web': '',\n",
    "            'unit': 'MW'\n",
    "        }\n",
    "        tuples = [tuple(new_col_header[level].format(\n",
    "            tech=tech, attribute=attribute) for level in headers)]\n",
    "        sum_col.columns = pd.MultiIndex.from_tuples(tuples, names=headers)\n",
    "        data_sets['15min'] = data_sets['15min'].combine_first(sum_col)\n",
    "\n",
    "        if not attribute == 'generation':\n",
    "            continue\n",
    "        \n",
    "        # Calculate the profile column\n",
    "        profile_col = (sum_col.values * 100 /\n",
    "                       data_sets['15min']['DE', tech, 'capacity'])\n",
    "        \n",
    "        # Create a new MultiIndex and append aggregated data to the dataset\n",
    "        new_col_header = {\n",
    "            'variable': '{tech}',\n",
    "            'region': 'DE',\n",
    "            'attribute': 'profile',\n",
    "            'source': 'own calculation',\n",
    "            'web': '',\n",
    "            'unit': 'percent'\n",
    "        }\n",
    "        tuples = [tuple(new_col_header[level].format(tech=tech)\n",
    "                        for level in headers)]\n",
    "        profile_col.columns = pd.MultiIndex.from_tuples(tuples, names=headers)\n",
    "        data_sets['15min'] = data_sets['15min'].combine_first(profile_col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4e311f14-5124-47fd-9f94-0a866e9a6f71"
    }
   },
   "source": [
    "## Create hourly data from 15' data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "882bf163-9e93-44a4-9c51-a3ce0c32b4f4"
    }
   },
   "source": [
    "Some data comes in 15-minute intervals (i.e. German renewable generation), other in 60-minutes (i.e. load data from ENTSO-E and Prices). We resample the 15-minute data to hourly resolution and append it to the 60-minutes dataset.\n",
    "\n",
    "The marker column is resampled separately in such a way that all information on where data has been interpolated is preserved.\n",
    "\n",
    "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 15 seconds to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:19:53.307021Z",
     "start_time": "2017-07-09T02:19:34.414330Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Condense marker column from 15 to 60 minutes resolutution\n",
    "marker_15 = data_sets['15min']['interpolated_values'].groupby(\n",
    "    pd.Grouper(freq='60Min', closed='left', label='left')\n",
    "    ).agg(resample_markers)\n",
    "marker_15 = marker_15.reindex(data_sets['60min'].index)\n",
    "\n",
    "# Glue condensed 15 min marker onto 60 min marker\n",
    "marker_60 = data_sets['60min']['interpolated_values']\n",
    "both_markers = marker_15.notnull() & marker_60.notnull()\n",
    "only_15 = marker_15.notnull() & marker_60.isnull()\n",
    "data_sets['60min'].loc[both_markers, 'interpolated_values'] = (\n",
    "    marker_60.str.cat(others=marker_15, sep=' | '))\n",
    "data_sets['60min'].loc[only_15, 'interpolated_values'] = marker_15\n",
    "\n",
    "resampled = data_sets['15min'].resample('H').mean()\n",
    "data_sets['60min'] = data_sets['60min'].combine_first(resampled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fill columns not retrieved directly from TSO webites with  ENTSO-E Transparency data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:06.911096Z",
     "start_time": "2017-07-09T02:19:53.316027Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "c19e3eb9-feca-4e94-9035-580aa07189ba"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for res_key, df in entso_e.items():\n",
    "    resampled = (df.drop('DE_AT_LU', axis=1, errors='ignore')\n",
    "                 .resample('H').mean())\n",
    "    entso_e['60min'] = entso_e['60min'].combine_first(resampled)\n",
    "    \n",
    "    # combine with TSO data\n",
    "    if data_sets[res_key].empty:\n",
    "        data_sets[res_key] = df\n",
    "    else:\n",
    "        data_cols = data_sets[res_key].columns.droplevel(['source', 'web', 'unit'])\n",
    "        add_cols = [col for col in df.columns if not col[:3] in data_cols]\n",
    "        add_cols = pd.MultiIndex.from_tuples(add_cols, names=headers)\n",
    "        data_sets[res_key] = data_sets[res_key].combine_first(df[add_cols])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "dea911fb-326f-46a6-a009-57af147d4be4"
    }
   },
   "source": [
    "## Insert a column with Central European (Summer-)time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The index column of th data sets defines the start of the timeperiod represented by each row of that data set in **UTC** time. We include an additional column for the **CE(S)T** Central European (Summer-) Time, as this might help aligning the output data with other data sources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T19:48:57.488581Z",
     "start_time": "2017-07-09T19:48:57.378501Z"
    },
    "collapsed": true,
    "init_cell": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "info_cols = {'utc': 'utc_timestamp',\n",
    "             'cet': 'cet_cest_timestamp',\n",
    "             'marker': 'interpolated_values'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:08.083928Z",
     "start_time": "2017-07-09T02:20:07.760699Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "b1550779-53cc-498d-980b-7aa253974c91"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    if df.empty:\n",
    "        continue\n",
    "    df.index.rename(info_cols['utc'], inplace=True)\n",
    "    df.insert(0, info_cols['cet'],\n",
    "              df.index.tz_localize('UTC').tz_convert('Europe/Brussels'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a final savepoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:12.060036Z",
     "start_time": "2017-07-09T02:20:08.091934Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets['15min'].to_pickle('final_15.pickle')\n",
    "data_sets['30min'].to_pickle('final_30.pickle')\n",
    "data_sets['60min'].to_pickle('final_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:17.245429Z",
     "start_time": "2017-07-09T02:20:12.066039Z"
    },
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('final_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('final_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('final_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4e625ea7-c448-45fe-85b2-026157ad24c0"
    }
   },
   "source": [
    "# Create metadata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "38b6a454-31c4-4112-971f-da8271131d54"
    }
   },
   "source": [
    "This section: create the metadata, both general and column-specific. All metadata we be stored as a JSON file. Takes <1s to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:17.943925Z",
     "start_time": "2017-07-09T02:20:17.272449Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# change to out_path directory\n",
    "os.chdir(out_path)\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:24.555148Z",
     "start_time": "2017-07-09T02:20:17.948927Z"
    },
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "make_json(data_sets, info_cols, version, changes, headers, areas)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "fa919796-a7f6-4556-aeed-181ddc6028ac"
    }
   },
   "source": [
    "# Write data to disk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "1b3c9199-ce14-4487-939d-656f60c14df3"
    }
   },
   "source": [
    "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Limit time range\n",
    "Cut off the data outside of [start_from_user:end_from_user]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:20:25.125551Z",
     "start_time": "2017-07-09T02:20:24.561152Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    # First, convert userinput to UTC time to conform with data_set.index\n",
    "    if start_from_user:\n",
    "        start_from_user = (\n",
    "            pytz.timezone('Europe/Brussels')\n",
    "            .localize(datetime.combine(start_from_user, time()))\n",
    "            .astimezone(pytz.timezone('UTC')))\n",
    "    if end_from_user:\n",
    "        end_from_user = (\n",
    "            pytz.timezone('Europe/Brussels')\n",
    "            .localize(datetime.combine(end_from_user, time()))\n",
    "            .astimezone(pytz.timezone('UTC'))\n",
    "            # appropriate offset to inlude the end of period\n",
    "            + timedelta(days=1, minutes=-int(res_key[:2])))\n",
    "    # Then cut off the data_set\n",
    "    data_sets[res_key] = df.loc[start_from_user:end_from_user, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "47c1efa2-d93f-4d13-81d7-8f64dadeff3f"
    }
   },
   "source": [
    "## Different shapes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "a753ac43-a0f4-44bc-a89d-1ccaaf48289a"
    }
   },
   "source": [
    "Data are provided in three different \"shapes\": \n",
    "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n",
    "  - Fileformat: CSV, SQLite\n",
    "- MultiIndex (easy to read into GAMS, not compatible with datapackage standard, small file size)\n",
    "  - Fileformat: CSV, Excel\n",
    "- Stacked (compatible with data package standard, large file size, many rows, too many for Excel) \n",
    "  - Fileformat: CSV\n",
    "\n",
    "The different shapes need to be created internally befor they can be saved to files. Takes about 1 minute to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:21:45.700989Z",
     "start_time": "2017-07-09T02:20:25.140561Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "769225c6-31f5-4db8-8d91-32a3f983489c"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data_sets_singleindex = {}\n",
    "data_sets_multiindex = {}\n",
    "data_sets_stacked = {}\n",
    "for res_key, df in data_sets.items():\n",
    "    if df.empty:\n",
    "        continue\n",
    "\n",
    "    for col_name, col in df.iteritems():\n",
    "        if not (col_name[0] in info_cols.values() or\n",
    "                col_name[2] == 'profile'):\n",
    "            df[col_name] = col.round(0)\n",
    "\n",
    "    # MultIndex\n",
    "    data_sets_multiindex[res_key + '_multiindex'] = df\n",
    "    \n",
    "    # SingleIndex\n",
    "    df_singleindex = df.copy()\n",
    "    # use first 3 levels of multiindex to create singleindex\n",
    "    df_singleindex.columns = [\n",
    "        col[0] if col[0] in info_cols.values()\n",
    "        else '_'.join([level for level in col[0:3] if not level == ''])\n",
    "        for col in df.columns.values]\n",
    "\n",
    "    data_sets_singleindex[res_key + '_singleindex'] = df_singleindex\n",
    "\n",
    "    # Stacked\n",
    "    stacked = df.copy()\n",
    "    stacked.drop(info_cols['cet'], axis=1, inplace=True)\n",
    "    stacked.columns = stacked.columns.droplevel(['source', 'web', 'unit'])\n",
    "    stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
    "    data_sets_stacked[res_key + '_stacked'] = stacked"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "84f1822e-3aa6-42c4-a424-5dc5ab6fa56f"
    }
   },
   "source": [
    "## Write to SQL-database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "3bb78fa9-5309-46b6-b945-68dcb654a567"
    }
   },
   "source": [
    "This file format is required for the filtering function on the OPSD website. This takes about 30 seconds to complete."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T02:24:14.291087Z",
     "start_time": "2017-07-09T02:21:45.749022Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "fd35212c-ec5c-4fcf-9897-4608742d1bf8"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets_singleindex.items():\n",
    "    table = 'time_series_' + res_key\n",
    "    df = df.copy()\n",
    "    df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
    "    df[info_cols['cet']] = (\n",
    "        df[info_cols['cet']].dt.strftime('%Y-%m-%dT%H:%M:%S%z'))\n",
    "    df.to_sql(table, sqlite3.connect('time_series.sqlite'),\n",
    "              if_exists='replace', index_label=info_cols['utc'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "143b39aa-dd2e-4923-be56-bb6c4706837d"
    }
   },
   "source": [
    "## Write to Excel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "ef7763f5-9bbc-40b8-8cee-829131b40336"
    }
   },
   "source": [
    "Writing the full tables to Excel takes extremely long. As a workaround, only the first 5 rows are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T11:01:29.192741Z",
     "start_time": "2017-07-09T11:01:27.080351Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "9e84c62a-7bd6-4319-89dd-409574dda234"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "writer = pd.ExcelWriter('time_series1.xlsx')\n",
    "for res_key, df in data_sets_multiindex.items():\n",
    "    df.head().to_excel(writer, res_key.split('_')[0], float_format='%.2f',\n",
    "                       merge_cells=True)\n",
    "    # merge_cells=False doesn't work properly with multiindex\n",
    "writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "454ee5f5-e8f1-4088-94e9-e846f48ee75b"
    }
   },
   "source": [
    "## Write to CSV"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "279e3306-6dea-454d-bec8-0c826509ecd1"
    }
   },
   "source": [
    "This takes about 10 minutes to complete."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T09:26:36.780830Z",
     "start_time": "2017-07-09T09:23:34.921226Z"
    },
    "collapsed": true,
    "nbpresent": {
     "id": "89449c49-608d-488d-8bc8-077c64bc26c7"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# itertoools.chain() allows iterating over multiple dicts at once\n",
    "for res_stacking_key, df in itertools.chain(\n",
    "#        data_sets_singleindex.items(),\n",
    "#        data_sets_multiindex.items(),\n",
    "        data_sets_stacked.items()\n",
    "    ):\n",
    "    # convert the format of the cet_cest-timestamp to ISO-8601\n",
    "    if not (res_stacking_key.split('_')[1] == 'stacked'\n",
    "            or type(df.iloc[0,0]) == str):\n",
    "        df.iloc[:,0] = df.iloc[:,0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')\n",
    "    filename = 'time_series_' + res_stacking_key + '.csv'\n",
    "    df.to_csv(filename, float_format='%.2f',\n",
    "              date_format='%Y-%m-%dT%H:%M:%SZ')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write checksums.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We publish SHA-checksums for the outputfiles on GitHub to allow verifying the integrity of outputfiles on the OPSD server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-07-09T12:16:54.760917Z",
     "start_time": "2017-07-09T12:16:46.394519Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_sha_hash(path, blocksize=65536):\n",
    "    sha_hasher = hashlib.sha256()\n",
    "    with open(path, 'rb') as f:\n",
    "        buffer = f.read(blocksize)\n",
    "        while len(buffer) > 0:\n",
    "            sha_hasher.update(buffer)\n",
    "            buffer = f.read(blocksize)\n",
    "        return sha_hasher.hexdigest()\n",
    "\n",
    "files = os.listdir(out_path)\n",
    "\n",
    "# Create checksums.txt in the output directory\n",
    "with open('checksums.txt', 'w') as f:\n",
    "    for file_name in files:\n",
    "        if file_name.split('.')[-1] in ['csv', 'sqlite', 'xlsx']:\n",
    "            file_hash = get_sha_hash(file_name)\n",
    "            f.write('{},{}\\n'.format(file_name, file_hash))\n",
    "\n",
    "# Copy the file to root directory from where it will be pushed to GitHub,\n",
    "# leaving a copy in the version directory for reference\n",
    "copyfile('checksums.txt', os.path.join(home_path, 'checksums.txt'))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "celltoolbar": "Initialization Cell",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  },
  "nbpresent": {
   "slides": {
    "f6b300bf-88b5-4dea-951e-c926a9ea8287": {
     "id": "f6b300bf-88b5-4dea-951e-c926a9ea8287",
     "prev": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "regions": {
      "dc486e18-7547-4610-99c0-55dfb5553f62": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71",
        "part": "source"
       },
       "id": "dc486e18-7547-4610-99c0-55dfb5553f62"
      }
     }
    },
    "f96dd4bc-93a6-4014-b85f-a43061cf5688": {
     "id": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "prev": null,
     "regions": {
      "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "1562965a-7d74-4c1c-8251-4d82847f294a",
        "part": "source"
       },
       "id": "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e"
      }
     }
    }
   },
   "themes": {}
  },
  "notify_time": "10",
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": false,
   "nav_menu": {
    "height": "512px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": true,
   "toc_position": {
    "height": "1238px",
    "left": "0px",
    "right": "1066px",
    "top": "106px",
    "width": "322px"
   },
   "toc_section_display": "block",
   "toc_window_display": true,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}