{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div style=\"width:100%; background-color: #D9EDF7; border: 1px solid #CFCFCF; text-align: left; padding: 10px;\">\n",
    "      <b>Time series: Processing Notebook</b>\n",
    "      <ul>\n",
    "        <li><a href=\"main.ipynb\">Main Notebook</a></li>\n",
    "        <li>Processing Notebook</li>\n",
    "      </ul>\n",
    "      <br>This Notebook is part of the <a href=\"http://data.open-power-system-data.org/time_series\">Time series Data Package</a> of <a href=\"http://open-power-system-data.org\">Open Power System Data</a>.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc": "true"
   },
   "source": [
    "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
    "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Introductory-Notes\" data-toc-modified-id=\"Introductory-Notes-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Introductory Notes</a></span></li><li><span><a href=\"#Settings\" data-toc-modified-id=\"Settings-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Settings</a></span><ul class=\"toc-item\"><li><span><a href=\"#Set-version-number-and-recent-changes\" data-toc-modified-id=\"Set-version-number-and-recent-changes-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>Set version number and recent changes</a></span></li><li><span><a href=\"#Import-Python-libraries\" data-toc-modified-id=\"Import-Python-libraries-2.2\"><span class=\"toc-item-num\">2.2&nbsp;&nbsp;</span>Import Python libraries</a></span></li><li><span><a href=\"#Display-options\" data-toc-modified-id=\"Display-options-2.3\"><span class=\"toc-item-num\">2.3&nbsp;&nbsp;</span>Display options</a></span></li><li><span><a href=\"#Set-directories\" data-toc-modified-id=\"Set-directories-2.4\"><span class=\"toc-item-num\">2.4&nbsp;&nbsp;</span>Set directories</a></span></li><li><span><a href=\"#Chromedriver\" data-toc-modified-id=\"Chromedriver-2.5\"><span class=\"toc-item-num\">2.5&nbsp;&nbsp;</span>Chromedriver</a></span></li><li><span><a href=\"#Set-up-a-log\" data-toc-modified-id=\"Set-up-a-log-2.6\"><span class=\"toc-item-num\">2.6&nbsp;&nbsp;</span>Set up a log</a></span></li><li><span><a href=\"#Select-timerange\" data-toc-modified-id=\"Select-timerange-2.7\"><span class=\"toc-item-num\">2.7&nbsp;&nbsp;</span>Select timerange</a></span></li><li><span><a href=\"#Select-download-source\" data-toc-modified-id=\"Select-download-source-2.8\"><span class=\"toc-item-num\">2.8&nbsp;&nbsp;</span>Select download source</a></span></li><li><span><a href=\"#Select-subset\" data-toc-modified-id=\"Select-subset-2.9\"><span class=\"toc-item-num\">2.9&nbsp;&nbsp;</span>Select subset</a></span></li></ul></li><li><span><a href=\"#Download\" data-toc-modified-id=\"Download-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Download</a></span><ul class=\"toc-item\"><li><span><a href=\"#Automatic-download-(for-most-sources)\" data-toc-modified-id=\"Automatic-download-(for-most-sources)-3.1\"><span class=\"toc-item-num\">3.1&nbsp;&nbsp;</span>Automatic download (for most sources)</a></span></li><li><span><a href=\"#Manual-download\" data-toc-modified-id=\"Manual-download-3.2\"><span class=\"toc-item-num\">3.2&nbsp;&nbsp;</span>Manual download</a></span><ul class=\"toc-item\"><li><span><a href=\"#Energinet.dk\" data-toc-modified-id=\"Energinet.dk-3.2.1\"><span class=\"toc-item-num\">3.2.1&nbsp;&nbsp;</span>Energinet.dk</a></span></li><li><span><a href=\"#CEPS\" data-toc-modified-id=\"CEPS-3.2.2\"><span class=\"toc-item-num\">3.2.2&nbsp;&nbsp;</span>CEPS</a></span></li><li><span><a href=\"#ENTSO-E-Power-Statistics\" data-toc-modified-id=\"ENTSO-E-Power-Statistics-3.2.3\"><span class=\"toc-item-num\">3.2.3&nbsp;&nbsp;</span>ENTSO-E Power Statistics</a></span></li></ul></li></ul></li><li><span><a href=\"#Read\" data-toc-modified-id=\"Read-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Read</a></span><ul class=\"toc-item\"><li><span><a href=\"#Preparations\" data-toc-modified-id=\"Preparations-4.1\"><span class=\"toc-item-num\">4.1&nbsp;&nbsp;</span>Preparations</a></span></li><li><span><a href=\"#Reading-loop\" data-toc-modified-id=\"Reading-loop-4.2\"><span class=\"toc-item-num\">4.2&nbsp;&nbsp;</span>Reading loop</a></span></li><li><span><a href=\"#Save-raw-data\" data-toc-modified-id=\"Save-raw-data-4.3\"><span class=\"toc-item-num\">4.3&nbsp;&nbsp;</span>Save raw data</a></span></li></ul></li><li><span><a href=\"#Processing\" data-toc-modified-id=\"Processing-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Processing</a></span><ul class=\"toc-item\"><li><span><a href=\"#Missing-data-handling\" data-toc-modified-id=\"Missing-data-handling-5.1\"><span class=\"toc-item-num\">5.1&nbsp;&nbsp;</span>Missing data handling</a></span><ul class=\"toc-item\"><li><span><a href=\"#Interpolation\" data-toc-modified-id=\"Interpolation-5.1.1\"><span class=\"toc-item-num\">5.1.1&nbsp;&nbsp;</span>Interpolation</a></span></li></ul></li><li><span><a href=\"#Country-specific-calculations\" data-toc-modified-id=\"Country-specific-calculations-5.2\"><span class=\"toc-item-num\">5.2&nbsp;&nbsp;</span>Country specific calculations</a></span><ul class=\"toc-item\"><li><span><a href=\"#Calculate-onshore-wind-generation-for-German-TSOs\" data-toc-modified-id=\"Calculate-onshore-wind-generation-for-German-TSOs-5.2.1\"><span class=\"toc-item-num\">5.2.1&nbsp;&nbsp;</span>Calculate onshore wind generation for German TSOs</a></span></li><li><span><a href=\"#Calculate-aggregate-wind-capacity-for-Germany-(on-+-offshore)\" data-toc-modified-id=\"Calculate-aggregate-wind-capacity-for-Germany-(on-+-offshore)-5.2.2\"><span class=\"toc-item-num\">5.2.2&nbsp;&nbsp;</span>Calculate aggregate wind capacity for Germany (on + offshore)</a></span></li><li><span><a href=\"#Aggregate-German-data-from-individual-TSOs-and-calculate-availabilities/profiles\" data-toc-modified-id=\"Aggregate-German-data-from-individual-TSOs-and-calculate-availabilities/profiles-5.2.3\"><span class=\"toc-item-num\">5.2.3&nbsp;&nbsp;</span>Aggregate German data from individual TSOs and calculate availabilities/profiles</a></span></li><li><span><a href=\"#Agregate-Italian-data\" data-toc-modified-id=\"Agregate-Italian-data-5.2.4\"><span class=\"toc-item-num\">5.2.4&nbsp;&nbsp;</span>Agregate Italian data</a></span></li></ul></li><li><span><a href=\"#Fill-columns-not-retrieved-directly-from-TSO-webites-with--ENTSO-E-Transparency-data\" data-toc-modified-id=\"Fill-columns-not-retrieved-directly-from-TSO-webites-with--ENTSO-E-Transparency-data-5.3\"><span class=\"toc-item-num\">5.3&nbsp;&nbsp;</span>Fill columns not retrieved directly from TSO webites with  ENTSO-E Transparency data</a></span></li><li><span><a href=\"#Resample-higher-frequencies-to-60'\" data-toc-modified-id=\"Resample-higher-frequencies-to-60'-5.4\"><span class=\"toc-item-num\">5.4&nbsp;&nbsp;</span>Resample higher frequencies to 60'</a></span></li><li><span><a href=\"#Insert-a-column-with-Central-European-(Summer-)time\" data-toc-modified-id=\"Insert-a-column-with-Central-European-(Summer-)time-5.5\"><span class=\"toc-item-num\">5.5&nbsp;&nbsp;</span>Insert a column with Central European (Summer-)time</a></span></li></ul></li><li><span><a href=\"#Create-a-final-savepoint\" data-toc-modified-id=\"Create-a-final-savepoint-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Create a final savepoint</a></span></li><li><span><a href=\"#Write-data-to-disk\" data-toc-modified-id=\"Write-data-to-disk-7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Write data to disk</a></span><ul class=\"toc-item\"><li><span><a href=\"#Limit-time-range\" data-toc-modified-id=\"Limit-time-range-7.1\"><span class=\"toc-item-num\">7.1&nbsp;&nbsp;</span>Limit time range</a></span></li><li><span><a href=\"#Different-shapes\" data-toc-modified-id=\"Different-shapes-7.2\"><span class=\"toc-item-num\">7.2&nbsp;&nbsp;</span>Different shapes</a></span></li><li><span><a href=\"#Write-to-SQL-database\" data-toc-modified-id=\"Write-to-SQL-database-7.3\"><span class=\"toc-item-num\">7.3&nbsp;&nbsp;</span>Write to SQL-database</a></span></li><li><span><a href=\"#Write-to-Excel\" data-toc-modified-id=\"Write-to-Excel-7.4\"><span class=\"toc-item-num\">7.4&nbsp;&nbsp;</span>Write to Excel</a></span></li><li><span><a href=\"#Write-to-CSV\" data-toc-modified-id=\"Write-to-CSV-7.5\"><span class=\"toc-item-num\">7.5&nbsp;&nbsp;</span>Write to CSV</a></span></li></ul></li><li><span><a href=\"#Create-metadata\" data-toc-modified-id=\"Create-metadata-8\"><span class=\"toc-item-num\">8&nbsp;&nbsp;</span>Create metadata</a></span><ul class=\"toc-item\"><li><span><a href=\"#Write-checksums.txt\" data-toc-modified-id=\"Write-checksums.txt-8.1\"><span class=\"toc-item-num\">8.1&nbsp;&nbsp;</span>Write checksums.txt</a></span></li></ul></li></ul></div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "bba7a58e-f3b2-4d3b-9617-d734c369084f"
    }
   },
   "source": [
    "# Introductory Notes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This Notebook handles missing data, performs calculations and aggragations and creates the output files."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "ba6b62da-6cee-476b-a563-c945f3fd0f79"
    }
   },
   "source": [
    "# Settings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section performs some preparatory steps."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set version number and recent changes\n",
    "Executing this script till the end will create a new version of the data package.\n",
    "The Version number specifies the local directory for the data <br>\n",
    "We include a note on what has been changed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "version = '2019-05-15'\n",
    "changes = '''Update with 2018 data'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "2b838df4-f987-4ae4-a132-9c898e3ffab1"
    }
   },
   "source": [
    "## Import Python libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "nbpresent": {
     "id": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Python modules\n",
    "from datetime import datetime, date, timedelta, time\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import logging\n",
    "import json\n",
    "import sqlite3\n",
    "import yaml\n",
    "import itertools\n",
    "import os\n",
    "import pytz\n",
    "from shutil import copyfile\n",
    "import pickle\n",
    "\n",
    "# Skripts from time-series repository\n",
    "from timeseries_scripts.read import read\n",
    "from timeseries_scripts.download import download\n",
    "from timeseries_scripts.imputation import find_nan, mark_own_calc\n",
    "from timeseries_scripts.make_json import make_json, get_sha_hash\n",
    "\n",
    "# Reload modules with execution of any code, to avoid having to restart\n",
    "# the kernel after editing timeseries_scripts\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# speed up tab completion in Jupyter Notebook\n",
    "%config Completer.use_jedi = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Display options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "# Allow pretty-display of multiple variables\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "\n",
    "# Adjust the way pandas DataFrames a re displayed to fit more columns\n",
    "pd.reset_option('display.max_colwidth')\n",
    "pd.options.display.max_columns = 60\n",
    "# pd.options.display.max_colwidth=5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set directories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "# make sure the working directory is this file's directory\n",
    "try:\n",
    "    os.chdir(home_path)\n",
    "except NameError:\n",
    "    home_path = os.path.realpath('.')\n",
    "\n",
    "# optionally, set a different directory to store outputs and raw data,\n",
    "# which will take up around 15 GB of disk space\n",
    "#Milos: save_path is None <=> use_external_dir == False\n",
    "use_external_dir = True\n",
    "if use_external_dir:\n",
    "    save_path = os.path.join('C:', os.sep, 'OPSD_time_series_data')\n",
    "else:\n",
    "    save_path = home_path\n",
    "\n",
    "input_path = os.path.join(home_path, 'input')\n",
    "sources_yaml_path = os.path.join(home_path, 'input', 'sources.yml')\n",
    "areas_csv_path = os.path.join(home_path, 'input', 'areas.csv')\n",
    "data_path = os.path.join(save_path, version, 'original_data')\n",
    "out_path = os.path.join(save_path, version) \n",
    "temp_path = os.path.join(save_path, 'temp')\n",
    "parsed_path = os.path.join(save_path, 'parsed')\n",
    "for path in [data_path, out_path, temp_path, parsed_path]:\n",
    "    os.makedirs(path, exist_ok=True)\n",
    "\n",
    "# change to temp directory\n",
    "os.chdir(temp_path)\n",
    "os.getcwd()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chromedriver\n",
    "\n",
    "If you want to download from sources which require scraping, download the appropriate version of Chromedriver for your platform, name it `chromedriver`, create folder `chromedriver` in the working directory, and move the driver to it. It is used by `Selenium` to scrape the links from web pages.\n",
    "\n",
    "The current list of sources which require scraping (as of December 2018):\n",
    "  - Terna\n",
    "    - Note that the package contains a database of Terna links up to **20 December 2018**. Bu default, the links are first looked up for in this database, so if the end date of your query is not after **20 December 2018**, you won't need Selenium. In the case that you need later dates, you have two options. If you set the variable `extract_new_terna_urls` to `True`, then Selenium will be used to download the files for those later dates. If you set `extract_new_terna_urls` to `False` (which is the default value), only the recorded links will be consulted and Selenium will not be used.\n",
    "      - Note: Make sure that the database file, `recorded_terna_urls.csv`, is located in the working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "# Deciding whether to use the provided database of Terna links\n",
    "extract_new_terna_urls = False\n",
    "\n",
    "# Saving the choice\n",
    "f = open(\"extract_new_terna_urls.pickle\", \"wb\")\n",
    "pickle.dump(extract_new_terna_urls, f)\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Set up a log"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "# Configure the display of logs in the notebook and attach it to the root logger\n",
    "logstream = logging.StreamHandler()\n",
    "logstream.setLevel(logging.INFO)  #threshold for log messages displayed in here\n",
    "logging.basicConfig(level=logging.INFO, handlers=[logstream])\n",
    "\n",
    "# Set up an additional logger for debug messages from the scripts\n",
    "script_logger = logging.getLogger('timeseries_scripts')\n",
    "script_logger.setLevel(logging.DEBUG)\n",
    "formatter = logging.Formatter(fmt='%(asctime)s %(name)s %(levelname)s %(message)s',\n",
    "                              datefmt='%Y-%m-%d %H:%M:%S',)\n",
    "logfile = logging.handlers.TimedRotatingFileHandler(os.path.join(temp_path, 'logfile.log'), when='midnight')\n",
    "logfile.setFormatter(formatter)\n",
    "logfile.setLevel(logging.DEBUG)   #threshold for log messages in logfile\n",
    "script_logger.addHandler(logfile)\n",
    "\n",
    "# Set up a logger for logs from the notebook\n",
    "logger = logging.getLogger('notebook')\n",
    "logger.addHandler(logfile)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execute for more detailed logging message (May slow down computation)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logstream.setLevel(logging.DEBUG)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "8ed60fdb-284d-43db-b802-8e5c405b8e84"
    }
   },
   "source": [
    "## Select timerange"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section: select the time range and the data sources for download and read. Default: all data sources implemented, full time range available.\n",
    "\n",
    "**Source parameters** are specified in [input/sources.yml](input/sources.yml), which describes, for each source, the datasets (such as wind and solar generation) alongside all the parameters necessary to execute the downloads.\n",
    "\n",
    "The option to perform downloading and reading of subsets is for testing only. To be able to run the script succesfully until the end, all sources have to be included, or otherwise the script will run into errors (i.e. the step where aggregate German timeseries are caculated requires data from all four German TSOs to be loaded)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to do this, specify the beginning and end of the interval for which to attempt the download.\n",
    "\n",
    "Type `None` to download all available data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "nbpresent": {
     "id": "f3008d47-ec89-40d0-85ee-776d110f3bb4"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "start_from_user = date(2005, 1, 1)\n",
    "end_from_user = date(2019, 4, 30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select download source"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Instead of downloading from the sources, the complete raw data can be downloaded as a zip file from the OPSD Server. Advantages are:\n",
    "- much faster download\n",
    "- back up of raw data in case it is deleted from the server at the original source\n",
    "\n",
    "In order to do this, specify an archive version to use the raw data from that version that has been cached on the OPSD server as input. All data from that version will be downloaded - timerange and subset will be ignored.\n",
    "\n",
    "Type `None` to download directly from the original sources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true
   },
   "outputs": [],
   "source": [
    "archive_version = None  # i.e. '2016-07-14'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read in the configuration file which contains all the required infos for the download."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with open(sources_yaml_path, 'r', encoding='UTF-8') as f:\n",
    "    sources = yaml.load(f.read())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next cell prints the available sources and datasets.<br>\n",
    "Copy from its output and paste to following cell to get the right format.<br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for k, v in sources.items():\n",
    "    print(yaml.dump({k: list(v.keys())}, default_flow_style=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, specify a subset to download/read.<br>\n",
    "Type `subset = None` to include all data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "subset = yaml.load('''\n",
    "OPSD:\n",
    "- capacity\n",
    "''')\n",
    "subset = None  # to include all sources\n",
    "\n",
    "exclude = yaml.load('''\n",
    "''')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now eliminate sources and datasets not in subset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "with open(sources_yaml_path, 'r', encoding='UTF-8') as f:\n",
    "    sources = yaml.load(f.read())\n",
    "if subset:  # eliminate sources and datasets not in subset\n",
    "    sources = {source_name: \n",
    "                  {k: v for k, v in sources[source_name].items()\n",
    "                        if k in dataset_list}\n",
    "               for source_name, dataset_list in subset.items()}\n",
    "if exclude:  # eliminate sources and variables in exclude\n",
    "    sources = {source_name: dataset_dict\n",
    "               for source_name, dataset_dict in sources.items()\n",
    "               if not source_name in exclude}\n",
    "\n",
    "# Printing the selected sources (all of them or just a subset)\n",
    "print(\"Selected sources: \")\n",
    "for k, v in sources.items():\n",
    "    print(yaml.dump({k: list(v.keys())}, default_flow_style=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": false
   },
   "source": [
    "# Download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This section: download data. Takes about 1 hour to run for the complete data set (`subset=None`).\n",
    "\n",
    "First, a data directory is created on your local computer. Then, download parameters for each data source are defined, including the URL. These parameters are then turned into a YAML-string. Finally, the download is executed file by file.\n",
    "\n",
    "Each file is saved under it's original filename. Note that the original file names are often not self-explanatory (called \"data\" or \"January\"). The files content is revealed by its place in the directory structure."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some sources (currently only ENTSO-E Transparency) require an account to allow downloading. For ENTSO-E Transparency, set up an account [here](https://transparency.entsoe.eu/usrm/user/createPublicUser)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "auth = yaml.load('''\n",
    "ENTSO-E Transparency FTP:\n",
    "    username: your email\n",
    "    password: your password\n",
    "Elexon:\n",
    "    username: your email\n",
    "    password: your password\n",
    "''')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "e08368cb-021f-453a-b201-a8d48bc8e4c4"
    }
   },
   "source": [
    "## Automatic download (for most sources)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "0c1eb987-6d5f-4e3d-9248-df80b9f37a49"
    }
   },
   "outputs": [],
   "source": [
    "download(sources, data_path, input_path, auth,\n",
    "         archive_version=None,\n",
    "         start_from_user=start_from_user,\n",
    "         end_from_user=end_from_user,\n",
    "         testmode=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "bae7810a-d4af-4021-a9ab-c4b772a2bd53"
    }
   },
   "source": [
    "## Manual download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Energinet.dk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "a69f8271-e1fc-4e64-814c-4210fb80c006"
    }
   },
   "source": [
    "Go to http://osp.energinet.dk/_layouts/Markedsdata/framework/integrations/markedsdatatemplate.aspx.\n",
    "\n",
    "\n",
    "**Check The Boxes as specified below:**\n",
    "- Periode\n",
    "    - Hent udtræk fra perioden: **01-01-2005** Til: **01-01-2019**\n",
    "    - Select all months\n",
    "- Datakolonner\n",
    "    - Elspot Pris, Valutakode/MWh: **Select all**\n",
    "    - Produktion og forbrug, MWh/h: **Select all**\n",
    "- Udtræksformat\n",
    "    - Valutakode: **EUR**\n",
    "    - Decimalformat: **Engelsk talformat (punktum som decimaltegn**\n",
    "    - Datoformat: **Andet datoformat (ÅÅÅÅ-MM-DD)**\n",
    "    - Hent Udtræk: **Til Excel**\n",
    "\n",
    "Click **Hent Udtræk**\n",
    "\n",
    "You will receive a file `Markedsata.xls` of about 50 MB. Open the file in Excel. There will be a warning from Excel saying that file extension and content are in conflict. Select \"open anyways\" and and save the file as `.xlsx`.\n",
    "\n",
    "In order to be found by the read-function, place the downloaded file in the following subdirectory:  \n",
    "**`{{data_path}}{{os.sep}}Energinet.dk{{os.sep}}prices_wind_solar{{os.sep}}2005-01-01_2019-01-01`**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CEPS\n",
    "\n",
    "Go to http://www.ceps.cz/en/all-data#GenerationRES\n",
    "\n",
    "**check boxes as specified below:**\n",
    "\n",
    "DISPLAY DATA FOR: **Generation RES**  \n",
    "TURN ON FILTER **checked**  \n",
    "FILTER SETTINGS: \n",
    "- Set the date range\n",
    "    - interval\n",
    "    - from: **2012** to: **2019**\n",
    "- Agregation and data version\n",
    "    - Aggregation: **Hour**\n",
    "    - Agregation function: **average (AVG)**\n",
    "    - Data version: **real data**\n",
    "- Filter\n",
    "    - Type of power plant: **ALL**\n",
    "- Click **USE FILTER**\n",
    "- DOWNLOAD DATA: **DATA V TXT**\n",
    "\n",
    "You will receive a file `data.txt` of about 1.5 MB.\n",
    "\n",
    "In order to be found by the read-function, place the downloaded file in the following subdirectory:  \n",
    "**`{{data_path}}{{os.sep}}CEPS{{os.sep}}wind_pv{{os.sep}}2012-01-01_2019-01-01`**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ENTSO-E Power Statistics\n",
    "\n",
    "Go to https://www.entsoe.eu/data/statistics/Pages/monthly_hourly_load.aspx\n",
    "\n",
    "**check boxes as specified below:**\n",
    "\n",
    "- Date From: **01-01-2016** Date To: **28-02-2019**\n",
    "- Country: **(Select All)**\n",
    "- Scale values to 100% using coverage ratio: **YES**\n",
    "- **View Report**\n",
    "- Click the Save symbol and select **Excel**\n",
    "\n",
    "You will receive a file `MHLV.xlsx` of about 8 MB.\n",
    "\n",
    "In order to be found by the read-function, place the downloaded file in the following subdirectory:  \n",
    "**`{{os.sep}}original_data{{os.sep}}ENTSO-E Power Statistics{{os.sep}}load{{os.sep}}2016-01-01_2016-04-30`**\n",
    "\n",
    "The data covers the period from 01-01-2016 up to the present, but 4 months of data seems to be the maximum that interface supports for a single download request, so you have to repeat the download procedure for 4-Month periods to cover the whole period until the present."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d0b353e7-179d-4556-bdd2-270192c830fb"
    }
   },
   "source": [
    "# Read"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "d223207e-2b0e-49ec-8bf6-af2969ee5b28"
    }
   },
   "source": [
    "This section: Read each downloaded file into a pandas-DataFrame and merge data from different sources if it has the same time resolution. Takes ~15 minutes to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "1300cbb2-efde-4844-b08c-fed092023e38"
    }
   },
   "source": [
    "Set the title of the rows at the top of the data used to store metadata internally. The order of this list determines the order of the levels in the resulting output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "nbpresent": {
     "id": "4dc92cc3-c01d-4c83-9252-80958edbe0f9"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "headers = ['region', 'variable', 'attribute', 'source', 'web', 'unit']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read a prepared table containing meta data on the geographical areas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "areas = pd.read_csv(areas_csv_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View the areas table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "areas.loc[areas['area ID'].notnull(), :'EIC'].fillna('')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading loop"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loop through sources and datasets to do the reading.\n",
    "First read the original CSV, Excel etc. files into pandas DataFrames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "areas = pd.read_csv(areas_csv_path)\n",
    "\n",
    "read(sources, data_path, parsed_path, areas, headers,\n",
    "     start_from_user=start_from_user, end_from_user=end_from_user,\n",
    "     testmode=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then combine the DataFrames that have the same temporal resolution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Create a dictionary of empty DataFrames to be populated with data\n",
    "data_sets = {'15min': pd.DataFrame(),\n",
    "             '30min': pd.DataFrame(),\n",
    "             '60min': pd.DataFrame()}\n",
    "entso_e = {'15min': pd.DataFrame(),\n",
    "           '30min': pd.DataFrame(),\n",
    "           '60min': pd.DataFrame()}\n",
    "for filename in os.listdir(parsed_path):\n",
    "    res_key, source_name, dataset_name, = filename.split('_')[:3]\n",
    "    if subset and not source_name in subset.keys():\n",
    "        continue\n",
    "    logger.info('include %s', filename)\n",
    "    df_portion = pd.read_pickle(os.path.join(parsed_path, filename))\n",
    "\n",
    "    if source_name == 'ENTSO-E Transparency FTP':\n",
    "        dfs = entso_e\n",
    "    else:\n",
    "        dfs = data_sets\n",
    "\n",
    "    if dfs[res_key].empty:\n",
    "        dfs[res_key] = df_portion\n",
    "    elif not df_portion.empty:\n",
    "        dfs[res_key] = dfs[res_key].combine_first(df_portion)\n",
    "    else:\n",
    "        logger.warning(filename + ' WAS EMPTY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    logger.info(res_key + ': %s', df.shape)\n",
    "for res_key, df in entso_e.items():\n",
    "    logger.info('ENTSO-E ' + res_key + ': %s', df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "be24bc5f-e281-43bf-853a-e7130d20d2e8"
    }
   },
   "source": [
    "Display some rows of the dataframes to get a first impression of the data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_sets['60min'].head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save raw data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "019c8ae9-8934-4074-adfd-278acd212152"
    }
   },
   "source": [
    "Save the DataFrames created by the read function to disk. This way you have the raw data to fall back to if something goes wrong in the ramainder of this notebook without having to repeat the previos steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets['15min'].to_pickle('raw_data_15.pickle')\n",
    "data_sets['30min'].to_pickle('raw_data_30.pickle')\n",
    "data_sets['60min'].to_pickle('raw_data_60.pickle')\n",
    "entso_e['15min'].to_pickle('raw_entso_e_15.pickle')\n",
    "entso_e['30min'].to_pickle('raw_entso_e_30.pickle')\n",
    "entso_e['60min'].to_pickle('raw_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the DataFrames saved above"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('raw_data_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('raw_data_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('raw_data_60.pickle')\n",
    "entso_e = {}\n",
    "entso_e['15min'] = pd.read_pickle('raw_entso_e_15.pickle')\n",
    "entso_e['30min'] = pd.read_pickle('raw_entso_e_30.pickle')\n",
    "entso_e['60min'] = pd.read_pickle('raw_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "026c6ad8-0422-4887-9361-8e45ae33e0c6"
    }
   },
   "source": [
    "# Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "c446157a-7872-491b-a9c4-0336b87568a6"
    }
   },
   "source": [
    "This section: missing data handling, aggregation of sub-national to national data, aggregate 15'-data to 60'-resolution. Takes 30 minutes to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "0c13e5eb-d8e3-425d-982e-2a25c364cdc5"
    }
   },
   "source": [
    "## Missing data handling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Interpolation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "81d4d772-d3c7-4e07-a0a7-49afb3bdf8f6"
    }
   },
   "source": [
    "Patch missing data. At this stage, only small gaps (up to 2 hours) are filled by linear interpolation. This catched most of the missing data due to daylight savings time transitions, while leaving bigger gaps untouched\n",
    "\n",
    "The exact locations of missing data are stored in the `nan_table` DataFrames."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "533cd53f-eb8f-4699-bbf7-72ad1cf90325"
    }
   },
   "source": [
    "Patch the datasets and display the location of missing Data in the original data. Takes ~5 minutes to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "33602355-28cb-4d5c-b97e-fe59b9b48883"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "nan_tables = {}\n",
    "overviews = {}\n",
    "for res_key, df in data_sets.items():\n",
    "    data_sets[res_key], nan_tables[res_key], overviews[res_key] = find_nan(\n",
    "        df, res_key, headers, patch=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "33602355-28cb-4d5c-b97e-fe59b9b48883"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for res_key, df in entso_e.items():\n",
    "    entso_e[res_key], nan_tables[res_key + ' ENTSO-E'], overviews[res_key + ' ENTSO-E'] = find_nan(\n",
    "        df, res_key, headers, patch=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "345c31ec-fa0b-4859-a930-8b7c20f1e1bf"
    }
   },
   "source": [
    "Execute this to see an example of where the data has been patched."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets['60min'][data_sets['60min']['interpolated_values'].notnull()].tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4c5c4893-5098-47df-8e68-51b13f5fd484"
    }
   },
   "source": [
    "Display the table of regions of missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "b5207ca6-4a72-40b6-8afd-3123f66ae323"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "nan_tables['60min']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can export the NaN-tables to Excel in order to inspect where there are NaNs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "writer = pd.ExcelWriter('NaN_table.xlsx')\n",
    "for res_key, df in nan_tables.items():\n",
    "    df.to_excel(writer, res_key)\n",
    "writer.save()\n",
    "\n",
    "writer = pd.ExcelWriter('Overview.xlsx')\n",
    "for res_key, df in overviews.items():\n",
    "    df.to_excel(writer, res_key)\n",
    "writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save/Load the patched data sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets['15min'].to_pickle('patched_15.pickle')\n",
    "data_sets['30min'].to_pickle('patched_30.pickle')\n",
    "data_sets['60min'].to_pickle('patched_60.pickle')\n",
    "entso_e['15min'].to_pickle('patched_entso_e_15.pickle')\n",
    "entso_e['30min'].to_pickle('patched_entso_e_30.pickle')\n",
    "entso_e['60min'].to_pickle('patched_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('patched_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('patched_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('patched_60.pickle')\n",
    "entso_e = {}\n",
    "entso_e['15min'] = pd.read_pickle('patched_entso_e_15.pickle')\n",
    "entso_e['30min'] = pd.read_pickle('patched_entso_e_30.pickle')\n",
    "entso_e['60min'] = pd.read_pickle('patched_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "abd61a95-68e4-4bf2-9f1f-ef010b4cd288"
    },
    "toc-hr-collapsed": false
   },
   "source": [
    "## Country specific calculations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some of the following operations require the Dataframes to be lexsorted in the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    df.sort_index(axis='columns', inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": false
   },
   "source": [
    "### Germany"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "cbcd1b7c-6c9c-4d1a-ab70-28276b421805"
    }
   },
   "source": [
    "#### Aggregate German data from individual TSOs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "9ab0116d-402b-444b-acf2-925388a9886b"
    }
   },
   "source": [
    "The wind and solar in-feed data for the 4 German control areas is summed up and stored in a new column. The column headers are created in the fashion introduced in the read script. Takes 5 seconds to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "26a83b1c-682d-43ec-bb70-12a096089508"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df = data_sets['15min']\n",
    "control_areas_DE = ['DE_50hertz', 'DE_amprion', 'DE_tennet', 'DE_transnetbw']\n",
    "\n",
    "for variable in ['solar', 'wind', 'wind_onshore', 'wind_offshore']:\n",
    "    # we could also include 'generation_forecast'\n",
    "    for attribute in ['generation_actual']:\n",
    "        # Calculate aggregate German generation\n",
    "        sum_frame = df.loc[:, (control_areas_DE, variable, attribute)]\n",
    "        sum_frame.head()        \n",
    "        sum_col = sum_frame.sum(axis='columns', skipna=False).to_frame().round(0)\n",
    "\n",
    "        # Create a new MultiIndex\n",
    "        new_col_header = {\n",
    "            'region': 'DE',\n",
    "            'variable': variable,\n",
    "            'attribute': attribute,\n",
    "            'source': 'own calculation based on German TSOs',\n",
    "            'web': '',\n",
    "            'unit': 'MW'\n",
    "        }\n",
    "        new_col_header = tuple(new_col_header[level] for level in headers)\n",
    "        data_sets['15min'][new_col_header] = sum_col\n",
    "        data_sets['15min'][new_col_header].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Italy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generation data for Italy come by region (North, Central North, Sicily, etc.) and separately for DSO and TSO, so they need to be agregated in order to get values for the whole country. In the next cell, we sum up the data by region and for each variable-attribute pair present in the Terna dataset header."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bidding_zones_IT = ['IT_CNOR', 'IT_CSUD', 'IT_NORD', 'IT_SARD', 'IT_SICI', 'IT_SUD']\n",
    "attributes = ['generation_actual', 'generation_actual_dso', 'generation_actual_tso']\n",
    "\n",
    "for variable in ['solar', 'wind_onshore']:\n",
    "    sum_col = (\n",
    "        data_sets['60min']\n",
    "        .loc[:, (bidding_zones_IT, variable, attributes)]\n",
    "        .sum(axis='columns', skipna=False))\n",
    "    \n",
    "    # Create a new MultiIndex\n",
    "    new_col_header = {\n",
    "        'region': 'IT',\n",
    "        'variable': variable,\n",
    "        'attribute': 'generation_actual',\n",
    "        'source': 'own calculation based on Terna',\n",
    "        'web': 'https://www.terna.it/SistemaElettrico/TransparencyReport/Generation/Forecastandactualgeneration.aspx',\n",
    "        'unit': 'MW'\n",
    "    }\n",
    "    new_col_header = tuple(new_col_header[level] for level in headers)\n",
    "    data_sets['60min'][new_col_header] = sum_col\n",
    "    data_sets['60min'][new_col_header].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Great Britain / United Kingdom"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data for Great Britain (without Northern Ireland) are disaggregated for DSO and TSO connected generators. We calculate aggregate values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for variable in ['solar', 'wind']:\n",
    "    sum_col = (data_sets['30min']\n",
    "               .loc[:, ('GB_GBN', variable, ['generation_actual_dso', 'generation_actual_tso'])]\n",
    "               .sum(axis='columns', skipna=False))\n",
    "    \n",
    "    # Create a new MultiIndex\n",
    "    new_col_header = {\n",
    "        'region' : 'GB_GBN',\n",
    "        'variable' : variable,\n",
    "        'attribute' : 'generation_actual',\n",
    "        'source': 'own calculation based on Elexon and National Grid',\n",
    "        'web': '',\n",
    "        'unit': 'MW'\n",
    "    }\n",
    "    new_col_header = tuple(new_col_header[level] for level in headers)\n",
    "    data_sets['30min'][new_col_header] = sum_col\n",
    "    data_sets['30min'][new_col_header].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate availabilities/profiles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate profiles, that is, the share of wind/solar capacity producing at a given time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    if res_key == '60min':\n",
    "        continue\n",
    "    for col_name, col in df.loc[:,(slice(None), slice(None), 'capacity')].iteritems():\n",
    "        # Calculate the profile column\n",
    "        kwargs = {'key': (col_name[0], col_name[1], 'generation_actual'),\n",
    "            'level': ['region', 'variable', 'attribute'],\n",
    "            'axis': 'columns', 'drop_level': False}\n",
    "        generation_col = df.xs(**kwargs)\n",
    "        # take ENTSO-E transparency data if there is none from TSO\n",
    "        if generation_col.size == 0:\n",
    "            try:\n",
    "                generation_col = entso_e[res_key].xs(**kwargs)\n",
    "            except KeyError:\n",
    "                continue\n",
    "            if generation_col.size == 0:\n",
    "                continue\n",
    "        profile_col = generation_col.divide(col, axis='index').round(4)\n",
    "\n",
    "        # Create a new MultiIndex\n",
    "        new_col_header = {\n",
    "            'region': '{region}',\n",
    "            'variable': '{variable}',\n",
    "            'attribute': 'profile',\n",
    "            'source': 'own calculation based on {source}',\n",
    "            'web': '',\n",
    "            'unit': 'fraction'\n",
    "        }\n",
    "        \n",
    "        source_capacity = col_name[3]\n",
    "        source_generation = generation_col.columns.get_level_values('source')[0]\n",
    "        if source_capacity == source_generation:\n",
    "            source = source_capacity\n",
    "        else:\n",
    "            source = (source_generation + ' and ' + source_capacity).replace('own calculation based on ', '')\n",
    "        new_col_header = tuple(new_col_header[level].format(region=col_name[0], variable=col_name[1], source=source)\n",
    "                                for level in headers)\n",
    "        data_sets[res_key][new_col_header] = profile_col\n",
    "        data_sets[res_key][new_col_header].describe()\n",
    "        \n",
    "        # Append profile to the dataset\n",
    "        df = df.combine_first(profile_col)\n",
    "        new_col_header"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some of the following operations require the Dataframes to be lexsorted in the columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for res_key, df in data_sets.items():\n",
    "    df.sort_index(axis='columns', inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another savepoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets['15min'].to_pickle('calc_15.pickle')\n",
    "data_sets['30min'].to_pickle('calc_30.pickle')\n",
    "data_sets['60min'].to_pickle('calc_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('calc_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('calc_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('calc_60.pickle')\n",
    "entso_e = {}\n",
    "entso_e['15min'] = pd.read_pickle('patched_entso_e_15.pickle')\n",
    "entso_e['30min'] = pd.read_pickle('patched_entso_e_30.pickle')\n",
    "entso_e['60min'] = pd.read_pickle('patched_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4e311f14-5124-47fd-9f94-0a866e9a6f71"
    }
   },
   "source": [
    "## Resample higher frequencies to 60'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "882bf163-9e93-44a4-9c51-a3ce0c32b4f4"
    }
   },
   "source": [
    "Some data comes in 15 or 30-minute intervals (i.e. German or British renewable generation), other in 60-minutes (i.e. load data from ENTSO-E and Prices). We resample the 15 and 30-minute data to hourly resolution and append it to the 60-minutes dataset.\n",
    "\n",
    "The `.resample('H').mean()` methods calculates the means from the values for 4 quarter hours [:00, :15, :30, :45] of an hour values, inserts that for :00 and drops the other 3 entries. Takes 15 seconds to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for ds in [data_sets, entso_e]:\n",
    "    for res_key, df in ds.items():\n",
    "        if res_key == '60min':\n",
    "            continue\n",
    "    #    # Resample first the marker column\n",
    "    #    marker_resampled = df['interpolated_values'].groupby(\n",
    "    #        pd.Grouper(freq='60Min', closed='left', label='left')\n",
    "    #    ).agg(resample_markers, drop_region='DE_AT_LU')\n",
    "    #    marker_resampled = marker_resampled.reindex(ds['60min'].index)\n",
    "\n",
    "    #    # Glue condensed 15/30 min marker onto 60 min marker\n",
    "    #    ds['60min'].loc[:, 'interpolated_values'] = glue_markers(\n",
    "    #        ds['60min']['interpolated_values'],\n",
    "    #        marker_resampled.reindex(ds['60min'].index))\n",
    "\n",
    "    #    # Drop DE_AT_LU bidding zone data from the 15 minute resolution data to\n",
    "    #    # be resampled since it is already provided in 60 min resolution by\n",
    "    #    # ENTSO-E Transparency\n",
    "    #    df = df.drop('DE_AT_LU', axis=1, errors='ignore')\n",
    "\n",
    "        # Do the resampling\n",
    "        resampled = df.resample('H').mean()\n",
    "        resampled.columns = resampled.columns.map(mark_own_calc)\n",
    "        resampled.columns.names = headers\n",
    "\n",
    "        # filter out columns already represented in hourly data\n",
    "        data_cols = ds['60min'].columns.droplevel(['source', 'web', 'unit'])\n",
    "        tuples = [col for col in resampled.columns if not col[:3] in data_cols]\n",
    "        add_cols = pd.MultiIndex.from_tuples(tuples, names=headers)\n",
    "        resampled = resampled[add_cols]\n",
    "        \n",
    "        # Round the resampled columns\n",
    "        for col in resampled.columns:\n",
    "            if col[2] == 'profile':\n",
    "                resampled.loc[:, col] = resampled.loc[:, col].round(4)\n",
    "            else:\n",
    "                resampled.loc[:, col] = resampled.loc[:, col].round(0)\n",
    "\n",
    "        ds['60min'] = ds['60min'].combine_first(resampled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fill columns not retrieved directly from TSO webites with  ENTSO-E Transparency data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "c19e3eb9-feca-4e94-9035-580aa07189ba"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_cols = data_sets['60min'].columns.droplevel(['source', 'web', 'unit'])\n",
    "\n",
    "for res_key, df in entso_e.items():\n",
    "     # Combine with TSO data\n",
    "\n",
    "#     # Copy entire 30min data from ENTSO-E if there is no data from TSO\n",
    "    if data_sets[res_key].empty:\n",
    "         data_sets[res_key] = df\n",
    "\n",
    "    else:\n",
    "        # Keep only region, variable, attribute in MultiIndex for comparison\n",
    "        # Compare columns from ENTSO-E against TSO's, keep which we don't have yet\n",
    "        cols = [col for col in df.columns if not col[:3] in data_cols]\n",
    "        add_cols = pd.MultiIndex.from_tuples(cols, names=headers)\n",
    "        data_sets[res_key] = data_sets[res_key].combine_first(df[add_cols])\n",
    "\n",
    "#         # Add the ENTSO-E markers (but only for the columns actually copied)\n",
    "#         add_cols = ['_'.join(col[:3]) for col in tuples]\n",
    "#         # Spread marker column out over a DataFrame for easiser comparison\n",
    "#         # Filter out everey second column, which contains the delimiter \" | \"\n",
    "#         # from the marker\n",
    "#         marker_table = (df['interpolated_values'].str.split(' | ', expand=True)\n",
    "#                         .filter(regex='^\\d*[02468]$', axis='columns'))\n",
    "#         # Replace cells with markers marking columns not copied with NaNs\n",
    "#         marker_table[~marker_table.isin(add_cols)] = np.nan\n",
    "\n",
    "#         for col_name, col in marker_table.iteritems():\n",
    "#             if col_name == 0:\n",
    "#                 marker_entso_e = col\n",
    "#             else:\n",
    "#                 marker_entso_e = glue_markers(marker_entso_e, col)\n",
    "\n",
    "#         # Glue ENTSO-E marker onto our old marker\n",
    "#         marker = data_sets[res_key]['interpolated_values']\n",
    "#         data_sets[res_key].loc[:, 'interpolated_values'] = glue_markers(\n",
    "#             marker, df['interpolated_values'].reindex(marker.index))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "dea911fb-326f-46a6-a009-57af147d4be4"
    }
   },
   "source": [
    "## Insert a column with Central European (Summer-)time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The index column of th data sets defines the start of the timeperiod represented by each row of that data set in **UTC** time. We include an additional column for the **CE(S)T** Central European (Summer-) Time, as this might help aligning the output data with other data sources."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "init_cell": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "info_cols = {'utc': 'utc_timestamp',\n",
    "             'cet': 'cet_cest_timestamp'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "b1550779-53cc-498d-980b-7aa253974c91"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for ds in [data_sets, entso_e]:\n",
    "    for res_key, df in ds.items():\n",
    "        if df.empty:\n",
    "            continue\n",
    "        df.index.rename(info_cols['utc'], inplace=True)\n",
    "        df.insert(0, info_cols['cet'],\n",
    "                  df.index.tz_localize('UTC').tz_convert('CET'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "4e625ea7-c448-45fe-85b2-026157ad24c0"
    }
   },
   "source": [
    "# Create a final savepoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data_sets['15min'].to_pickle('final_15.pickle')\n",
    "data_sets['30min'].to_pickle('final_30.pickle')\n",
    "data_sets['60min'].to_pickle('final_60.pickle')\n",
    "#entso_e['15min'].to_pickle('final_entso_e_15.pickle')\n",
    "#entso_e['30min'].to_pickle('final_entso_e_30.pickle')\n",
    "#entso_e['60min'].to_pickle('final_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(temp_path)\n",
    "data_sets = {}\n",
    "data_sets['15min'] = pd.read_pickle('final_15.pickle')\n",
    "data_sets['30min'] = pd.read_pickle('final_30.pickle')\n",
    "data_sets['60min'] = pd.read_pickle('final_60.pickle')\n",
    "#entso_e = {}\n",
    "#entso_e['15min'] = pd.read_pickle('final_entso_e_15.pickle')\n",
    "#entso_e['30min'] = pd.read_pickle('final_entso_e_30.pickle')\n",
    "#entso_e['60min'] = pd.read_pickle('final_entso_e_60.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined = data_sets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Show the column names contained in the final DataFrame in a table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "col_info = pd.DataFrame()\n",
    "df = combined['60min']\n",
    "for level in df.columns.names:\n",
    "    col_info[level] = df.columns.get_level_values(level)\n",
    "\n",
    "col_info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "fa919796-a7f6-4556-aeed-181ddc6028ac"
    }
   },
   "source": [
    "# Write data to disk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "1b3c9199-ce14-4487-939d-656f60c14df3"
    }
   },
   "source": [
    "This section: Save as [Data Package](http://data.okfn.org/doc/tabular-data-package) (data in CSV, metadata in JSON file). All files are saved in the directory of this notebook. Alternative file formats (SQL, XLSX) are also exported. Takes about 1 hour to run."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Limit time range\n",
    "Cut off the data outside of `[start_from_user:end_from_user]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for res_key, df in combined.items():\n",
    "    # In order to make sure that the respective time period is covered in both\n",
    "    # UTC and CE(S)T, we set the start in CE(S)T, but the end in UTC\n",
    "    if start_from_user:\n",
    "        start_from_user = (\n",
    "            pytz.timezone('Europe/Brussels')\n",
    "            .localize(datetime.combine(start_from_user, time()))\n",
    "            .astimezone(pytz.timezone('UTC')))\n",
    "    if end_from_user:\n",
    "        end_from_user = (\n",
    "            pytz.timezone('UTC')\n",
    "            .localize(datetime.combine(end_from_user, time()))\n",
    "            # Appropriate offset to inlude the end of period\n",
    "            + timedelta(days=1, minutes=-int(res_key[:2])))\n",
    "    # Then cut off the data_set\n",
    "    data_sets[res_key] = df.loc[start_from_user:end_from_user, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "47c1efa2-d93f-4d13-81d7-8f64dadeff3f"
    }
   },
   "source": [
    "## Different shapes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "a753ac43-a0f4-44bc-a89d-1ccaaf48289a"
    }
   },
   "source": [
    "Data are provided in three different \"shapes\": \n",
    "- SingleIndex (easy to read for humans, compatible with datapackage standard, small file size)\n",
    "  - Fileformat: CSV, SQLite\n",
    "- MultiIndex (easy to read into GAMS, not compatible with datapackage standard, small file size)\n",
    "  - Fileformat: CSV, Excel\n",
    "- Stacked (compatible with data package standard, large file size, many rows, too many for Excel) \n",
    "  - Fileformat: CSV\n",
    "\n",
    "The different shapes need to be created internally befor they can be saved to files. Takes about 1 minute to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "769225c6-31f5-4db8-8d91-32a3f983489c"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "combined_singleindex = {}\n",
    "combined_multiindex = {}\n",
    "combined_stacked = {}\n",
    "for res_key, df in combined.items():\n",
    "    if df.empty:\n",
    "        continue\n",
    "\n",
    "#    # Round floating point numbers to 2 digits\n",
    "#    for col_name, col in df.iteritems():\n",
    "#        if col_name[0] in info_cols.values():\n",
    "#            pass\n",
    "#        elif col_name[2] == 'profile':\n",
    "#            df[col_name] = col.round(4)\n",
    "#        else:\n",
    "#            df[col_name] = col.round(3)\n",
    "\n",
    "    # MultIndex\n",
    "    combined_multiindex[res_key + '_multiindex'] = df\n",
    "\n",
    "    # SingleIndex\n",
    "    df_singleindex = df.copy()\n",
    "    # use first 3 levels of multiindex to create singleindex\n",
    "    df_singleindex.columns = [\n",
    "        col_name[0] if col_name[0] in info_cols.values()\n",
    "        else '_'.join([level for level in col_name[0:3] if not level == ''])\n",
    "        for col_name in df.columns.values]\n",
    "\n",
    "    combined_singleindex[res_key + '_singleindex'] = df_singleindex\n",
    "\n",
    "    # Stacked\n",
    "    stacked = df.copy().drop(columns=info_cols['cet'], level=0)\n",
    "    stacked.columns = stacked.columns.droplevel(['source', 'web', 'unit'])\n",
    "    # Concatenate all columns below each other (=\"stack\").\n",
    "    # df.transpose().stack() is faster than stacking all column levels\n",
    "    # seperately\n",
    "    stacked = stacked.transpose().stack(dropna=True).to_frame(name='data')\n",
    "    combined_stacked[res_key + '_stacked'] = stacked"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "84f1822e-3aa6-42c4-a424-5dc5ab6fa56f"
    }
   },
   "source": [
    "## Write to SQLite-database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "3bb78fa9-5309-46b6-b945-68dcb654a567"
    }
   },
   "source": [
    "This file format is required for the filtering function on the OPSD website. This takes ~3 minutes to complete."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "fd35212c-ec5c-4fcf-9897-4608742d1bf8"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(out_path)\n",
    "for res_key, df in combined_singleindex.items():\n",
    "    table = 'time_series_' + res_key\n",
    "    df = df.copy()\n",
    "    df.index = df.index.strftime('%Y-%m-%dT%H:%M:%SZ')\n",
    "    cet_col_name = info_cols['cet']\n",
    "    df[cet_col_name] = (df[cet_col_name].dt.strftime('%Y-%m-%dT%H:%M:%S%z'))\n",
    "    df.to_sql(table, sqlite3.connect('time_series.sqlite'),\n",
    "              if_exists='replace', index_label=info_cols['utc'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "143b39aa-dd2e-4923-be56-bb6c4706837d"
    }
   },
   "source": [
    "## Write to Excel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "ef7763f5-9bbc-40b8-8cee-829131b40336"
    }
   },
   "source": [
    "Writing the full tables to Excel takes extremely long. As a workaround, only the timestamp-columns are exported. The rest of the data can than be inserted manually from the `_multindex.csv` files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "9e84c62a-7bd6-4319-89dd-409574dda234"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "os.chdir(out_path)\n",
    "writer = pd.ExcelWriter('time_series.xlsx')\n",
    "for res_key, df in data_sets.items():\n",
    "    # Need to convert CE(S)T-timestamps to tz-naive, otherwise Excel converts\n",
    "    # them back to UTC\n",
    "    df.loc[:,(info_cols['cet'], '', '', '', '', '')].dt.tz_localize(None).to_excel(writer, res_key)\n",
    "    filename = 'tsos_' + res_key + '.csv'\n",
    "    df.to_csv(filename, float_format='%.4f', date_format='%Y-%m-%dT%H:%M:%SZ')\n",
    "for res_key, df in entso_e.items():\n",
    "    df.loc[:,(info_cols['cet'], '', '', '', '', '')].dt.tz_localize(None).to_excel(writer, res_key+ ' ENTSO-E')\n",
    "    filename = 'entso_e_' + res_key + '.csv'\n",
    "    df.to_csv(filename, float_format='%.4f', date_format='%Y-%m-%dT%H:%M:%SZ')\n",
    "writer.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "454ee5f5-e8f1-4088-94e9-e846f48ee75b"
    }
   },
   "source": [
    "## Write to CSV"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "279e3306-6dea-454d-bec8-0c826509ecd1"
    }
   },
   "source": [
    "This takes about 10 minutes to complete."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "nbpresent": {
     "id": "89449c49-608d-488d-8bc8-077c64bc26c7"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "os.chdir(out_path)\n",
    "# itertoools.chain() allows iterating over multiple dicts at once\n",
    "for res_stacking_key, df in itertools.chain(\n",
    "    combined_singleindex.items(),\n",
    "    combined_multiindex.items(),\n",
    "    combined_stacked.items()):\n",
    "\n",
    "    df = df.copy()\n",
    "\n",
    "    # convert the format of the cet_cest-timestamp to ISO-8601\n",
    "    if not res_stacking_key.split('_')[1] == 'stacked':\n",
    "        df.iloc[:, 0] = df.iloc[:, 0].dt.strftime('%Y-%m-%dT%H:%M:%S%z')  # https://frictionlessdata.io/specs/table-schema/#date\n",
    "    filename = 'time_series_' + res_stacking_key + '.csv'\n",
    "    df.to_csv(filename, float_format='%.4f',\n",
    "              date_format='%Y-%m-%dT%H:%M:%SZ')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create metadata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "nbpresent": {
     "id": "38b6a454-31c4-4112-971f-da8271131d54"
    }
   },
   "source": [
    "This section: create the metadata, both general and column-specific. All metadata we be stored as a JSON file. Takes 10s to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "os.chdir(out_path)\n",
    "make_json(combined, info_cols, version, changes, headers, areas,\n",
    "          start_from_user, end_from_user)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write checksums.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We publish SHA-checksums for the outputfiles on GitHub to allow verifying the integrity of outputfiles on the OPSD server."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(out_path)\n",
    "files = os.listdir(out_path)\n",
    "\n",
    "# Create checksums.txt in the output directory\n",
    "with open('checksums.txt', 'w') as f:\n",
    "    for file_name in files:\n",
    "        if file_name.split('.')[-1] in ['csv', 'sqlite', 'xlsx']:\n",
    "            file_hash = get_sha_hash(file_name)\n",
    "            f.write('{},{}\\n'.format(file_name, file_hash))\n",
    "\n",
    "# Copy the file to root directory from where it will be pushed to GitHub,\n",
    "# leaving a copy in the version directory for reference\n",
    "copyfile('checksums.txt', os.path.join(home_path, 'checksums.txt'))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "celltoolbar": "Initialization Cell",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "nbpresent": {
   "slides": {
    "f6b300bf-88b5-4dea-951e-c926a9ea8287": {
     "id": "f6b300bf-88b5-4dea-951e-c926a9ea8287",
     "prev": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "regions": {
      "dc486e18-7547-4610-99c0-55dfb5553f62": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "c0035fc6-ff1d-44d8-a3fd-b4c08f53be71",
        "part": "source"
       },
       "id": "dc486e18-7547-4610-99c0-55dfb5553f62"
      }
     }
    },
    "f96dd4bc-93a6-4014-b85f-a43061cf5688": {
     "id": "f96dd4bc-93a6-4014-b85f-a43061cf5688",
     "prev": null,
     "regions": {
      "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e": {
       "attrs": {
        "height": 1,
        "width": 1,
        "x": 0,
        "y": 0
       },
       "content": {
        "cell": "1562965a-7d74-4c1c-8251-4d82847f294a",
        "part": "source"
       },
       "id": "657c3ad3-2fcf-4c8e-a527-de3d0a46fa4e"
      }
     }
    }
   },
   "themes": {}
  },
  "notify_time": "10",
  "toc": {
   "base_numbering": "1",
   "nav_menu": {
    "height": "512px",
    "width": "252px"
   },
   "number_sections": true,
   "sideBar": false,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "660px",
    "left": "59px",
    "right": "2296px",
    "top": "110px",
    "width": "266px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  },
  "toc-autonumbering": false,
  "toc-showcode": false,
  "toc-showmarkdowntxt": false,
  "toc-showtags": false,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "1101px",
    "left": "2009px",
    "right": "67px",
    "top": "127px",
    "width": "484px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}