{ "cells": [ { "cell_type": "markdown", "id": "b45644b4", "metadata": {}, "source": [ "This notebook serves as explanation for a stackoverflow solution:\n", "\n", "\n", "# See https://stackoverflow.com/a/64832684/333403\n", "\n", "<br><br><br>\n", "\n", "More answers can be found at <https://stackoverflow.com/search?q=user:333403+[pandas]>.\n", "\n", "Please remember: **upvoting** answers keeps people motivated. (And reputation helps to place bounties and thus get answers to hard questions).\n", "\n", "<br><br><br>" ] }, { "cell_type": "code", "execution_count": 1, "id": "b45eff63", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sun Dec 25 10:54:25 2022\n" ] } ], "source": [ "import time\n", "print(time.ctime())\n", "\n", "try:\n", " import ipydex\n", " # this optional and enables the magic comment `##:` to view intermediate results\n", " %load_ext ipydex.displaytools\n", "except ImportError:\n", " pass" ] }, { "cell_type": "code", "execution_count": 2, "id": "aa1fa3a7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Timestamp</th>\n", " <th>Austria</th>\n", " <th>Belgium</th>\n", " <th>France</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>1993-11-01</td>\n", " <td>6.18</td>\n", " <td>7.05</td>\n", " <td>7.69</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1993-11-02</td>\n", " <td>6.18</td>\n", " <td>7.05</td>\n", " <td>7.61</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1993-11-03</td>\n", " <td>6.17</td>\n", " <td>7.20</td>\n", " <td>7.67</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1993-11-04</td>\n", " <td>6.17</td>\n", " <td>7.50</td>\n", " <td>7.91</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>1993-11-15</td>\n", " <td>6.40</td>\n", " <td>7.60</td>\n", " <td>8.61</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "df := Timestamp Austria Belgium France\n", "1 1993-11-01 6.18 7.05 7.69\n", "2 1993-11-02 6.18 7.05 7.61\n", "3 1993-11-03 6.17 7.20 7.67\n", "4 1993-11-04 6.17 7.50 7.91\n", "5 1993-11-15 6.40 7.60 8.61" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# original data\n", "\n", "import numpy as np\n", "import pandas as pd\n", "df = pd.DataFrame(\n", " {\"Timestamp\" : ['1993-11-01' ,'1993-11-02', '1993-11-03', '1993-11-04','1993-11-15'],\n", " \"Austria\" : [6.18 ,6.18, 6.17, 6.17, 6.40],\n", " \"Belgium\" : [7.05, 7.05, 7.2, 7.5, 7.6],\"France\" : [7.69, 7.61, 7.67, 7.91, 8.61]\n", " },\n", " index = [1, 2, 3,4,5]\n", ") ##:\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "f89a5ee2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "cols := ['Austria', 'Belgium', 'France']" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "col_pairs := [('Austria', 'Belgium'), ('Austria', 'France'), ('Belgium', 'France')]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import itertools as it\n", "\n", "# omit timestamp-col\n", "cols = list(df.columns)[1:] ##:\n", "\n", "col_pairs = list(it.combinations(cols, 2)) ##:\n", "\n", "# new data frame (yet empty) \n", "res = pd.DataFrame()" ] }, { "cell_type": "code", "execution_count": 4, "id": "df494994", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Aus_Bel_corr Aus_Fra_corr Bel_Fra_corr\n", "0 NaN NaN NaN\n", "1 NaN NaN NaN\n", "2 -1.000000 -0.277350 0.277350\n", "3 -0.755929 -0.654654 0.989743\n", "4 0.693375 0.969346 0.849167\n" ] } ], "source": [ "# complete solution\n", "\n", "for pair in col_pairs:\n", " # select the first three letters of each name of the pair\n", " corr_name = f\"{pair[0][:3]}_{pair[1][:3]}_corr\"\n", " res[corr_name] = df[list(pair)].\\\n", " rolling(min_periods=1, window=3).\\\n", " corr().iloc[0::2, -1].reset_index(drop=True)\n", "\n", "print(str(res))" ] }, { "cell_type": "code", "execution_count": 5, "id": "38b32962", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pair := ('Austria', 'Belgium')" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "corr_name := 'Aus_Bel_corr'" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# one iteration (i.e. one column):\n", "\n", "pair = col_pairs[0] ##:\n", "corr_name = f\"{pair[0][:3]}_{pair[1][:3]}_corr\" ##:" ] }, { "cell_type": "code", "execution_count": 6, "id": "857e51c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Austria</th>\n", " <th>Belgium</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>6.18</td>\n", " <td>7.05</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>6.18</td>\n", " <td>7.05</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>6.17</td>\n", " <td>7.20</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>6.17</td>\n", " <td>7.50</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>6.40</td>\n", " <td>7.60</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "tmp := Austria Belgium\n", "1 6.18 7.05\n", "2 6.18 7.05\n", "3 6.17 7.20\n", "4 6.17 7.50\n", "5 6.40 7.60" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# select the corresponding columns\n", "tmp = df[list(pair)] ##:" ] }, { "cell_type": "code", "execution_count": 7, "id": "1950a772", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tmp2 := Rolling [window=3,min_periods=1,center=False,axis=0,method=single]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# this rolling objects does not yet specify the operation\n", "tmp2 = tmp.rolling(min_periods=1, window=3) ##:" ] }, { "cell_type": "code", "execution_count": 8, "id": "71b32979", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Austria</th>\n", " <th>Belgium</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">1</th>\n", " <th>Austria</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">2</th>\n", " <th>Austria</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">3</th>\n", " <th>Austria</th>\n", " <td>1.000000</td>\n", " <td>-1.000000</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>-1.000000</td>\n", " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">4</th>\n", " <th>Austria</th>\n", " <td>1.000000</td>\n", " <td>-0.755929</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>-0.755929</td>\n", " <td>1.000000</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">5</th>\n", " <th>Austria</th>\n", " <td>1.000000</td>\n", " <td>0.693375</td>\n", " </tr>\n", " <tr>\n", " <th>Belgium</th>\n", " <td>0.693375</td>\n", " <td>1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "tmp3 := Austria Belgium\n", "1 Austria NaN NaN\n", " Belgium NaN NaN\n", "2 Austria NaN NaN\n", " Belgium NaN NaN\n", "3 Austria 1.000000 -1.000000\n", " Belgium -1.000000 1.000000\n", "4 Austria 1.000000 -0.755929\n", " Belgium -0.755929 1.000000\n", "5 Austria 1.000000 0.693375\n", " Belgium 0.693375 1.000000" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# now specify the operation to `corr`\n", "tmp3 = tmp2.corr() ##:" ] }, { "cell_type": "code", "execution_count": 9, "id": "60e6a811", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([(1, 'Austria'),\n", " (1, 'Belgium'),\n", " (2, 'Austria'),\n", " (2, 'Belgium'),\n", " (3, 'Austria'),\n", " (3, 'Belgium'),\n", " (4, 'Austria'),\n", " (4, 'Belgium'),\n", " (5, 'Austria'),\n", " (5, 'Belgium')],\n", " )" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The previous result contains to much data. We are interested only in every second row.\n", "# It also has a \"messed up\" index:\n", "tmp3.index" ] }, { "cell_type": "code", "execution_count": 10, "id": "0d758cf3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Purely integer-location based indexing for selection by position.\n", "\n", "``.iloc[]`` is primarily integer position based (from ``0`` to\n", "``length-1`` of the axis), but may also be used with a boolean\n", "array.\n", "\n", "Allowed inputs are:\n", "\n", "- An integer, e.g. ``5``.\n", "- A list or array of integers, e.g. ``[4, 3, 0]``.\n", "- A slice object with ints, e.g. ``1:7``.\n", "- A boolean array.\n", "- A ``callable`` function with one argument (the calling Series or\n", " DataFrame) and that returns valid output for indexing (one of the above).\n", " This is useful in method chains, when you don't have a reference to the\n", " calling object, but would like to base your selection on some value.\n", "\n", "``.iloc`` will raise ``IndexError`` if a requested indexer is\n", "out-of-bounds, except *slice* indexers which allow out-of-bounds\n", "indexing (this conforms\n" ] } ], "source": [ "# To only select every second row, we use the iloc property which is documentend here:\n", "# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html\n", "\n", "# in short: it allows access to the dataframe cells by purely integer indexing (without using column names etc.)\n", "\n", "print(tmp3.iloc.__doc__[:800])" ] }, { "cell_type": "code", "execution_count": 11, "id": "e9a21170", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tmp4 :=\n", "1 Austria NaN\n", "2 Austria NaN\n", "3 Austria -1.000000\n", "4 Austria -0.755929\n", "5 Austria 0.693375\n", "Name: Belgium, dtype: float64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(type(tmp4)) := pandas.core.series.Series" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# the slicing-syntax of iloc is like for numpy arrays: `start:stop:step` for rows and columns \n", "# `0::2` means: beginning with row index 0 until the end (no stop index) with a stepsizt of 2\n", "# the `-1` means: just select the last column (index -1), no colon -> no slice\n", "\n", "# for more information see: https://numpy.org/doc/stable/user/basics.indexing.html#slicing-and-striding\n", "\n", "tmp4 = tmp3.iloc[0::2, -1] ##:\\n\n", "type(tmp4) ##:" ] }, { "cell_type": "code", "execution_count": 12, "id": "5a0c1e94", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tmp5 :=\n", "0 NaN\n", "1 NaN\n", "2 -1.000000\n", "3 -0.755929\n", "4 0.693375\n", "Name: Belgium, dtype: float64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "---" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# finally, we install a new index beginning at 0:\n", "tmp5 = tmp4.reset_index(drop=True)##:\\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.6" } }, "nbformat": 4, "nbformat_minor": 5 }