{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# pandas Dataframe - Basic Operativity" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import addutils.toc ; addutils.toc.js(ipy_notebook=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from addutils import css_notebook\n", "css_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See [pandas documentation]() for more information and examples. Run the code at the end of the Notebook to generate the example files." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from numpy import NaN\n", "from addutils import side_by_side2\n", "from addutils import css_notebook\n", "from addutils import read_txt\n", "from IPython.core.display import HTML\n", "from faker import Factory\n", "css_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 File I/O and DataFrame Generation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pandas reads and format data from many different file formats: `txt, csv, web, xls, mat`. In this case we use `read_csv` to read two textual data files.\n", "\n", "First have a look to the file in its original form:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Date,AAPL,GOOG,JNJ,XOM\n", "\n", "2015-09-21,115.209999,635.440002,93.129997,73.389999\n", "\n", "2015-09-22,113.400002,622.690002,93.239998,72.739998\n", "\n", "2015-09-23,114.32,622.359985,92.989998,72.300003\n", "\n", "2015-09-24,115.0,625.799988,92.480003,72.730003\n", "\n" ] } ], "source": [ "read_txt('temp/p01_prices.txt')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Create DataFrames with read_csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This file can be read and formatted at the same time using `read_csv`. Lets read the two files `p01_prices.txt` and `p01_volumes.txt`" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-21115.209999635.44000293.12999773.389999
2015-09-22113.400002622.69000293.23999872.739998
2015-09-23114.320000622.35998592.98999872.300003
2015-09-24115.000000625.79998892.48000372.730003
2015-09-25114.709999611.96997191.00000073.230003
2015-09-28112.440002594.89001591.37000372.599998
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLJNJXOM
Date
2015-09-2146554300697140010585500
2015-09-22498090001060780014104200
2015-09-2335645700559730013777500
2015-09-2449810600717840014283800
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prices = pd.read_csv('temp/p01_prices.txt', index_col=0, parse_dates=[0])\n", "volumes = pd.read_csv('temp/p01_volumes.txt', index_col=0, parse_dates=[0])\n", "\n", "HTML(side_by_side2(prices, volumes))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Both \"prices\" and \"volumes\" datasets are 2D DataFrame objects:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(prices)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 Create DataFrames from Python Dictionaries" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fakeIT = Factory.create('it_IT')\n", "data = {'Name' : [fakeIT.name() for i in range(5)],\n", " 'Company' : [fakeIT.company() for i in range(5)],\n", " 'City' : [fakeIT.city() for i in range(5)]}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameCompanyCity
0Ruth MilaniBasile-Gentile SPAQuarto Elda
1Albino MarchettiDe luca SPADonati calabro
2Vienna CattaneoFerretti, Coppola e Colombo SPAGioacchino sardo
3Elga VitaleBianchi, Battaglia e Fontana e figliSan Cecco umbro
4Diana GrecoLombardi s.r.l.Borgo Sarita
\n", "
" ], "text/plain": [ " Name Company City\n", "0 Ruth Milani Basile-Gentile SPA Quarto Elda\n", "1 Albino Marchetti De luca SPA Donati calabro\n", "2 Vienna Cattaneo Ferretti, Coppola e Colombo SPA Gioacchino sardo\n", "3 Elga Vitale Bianchi, Battaglia e Fontana e figli San Cecco umbro\n", "4 Diana Greco Lombardi s.r.l. Borgo Sarita" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(data, columns = ['Name','Company','City'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3 Create DataFrames from Items" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameCompany
0Dott. Fatima CarboneCaputo s.r.l.
1Sig. Alighiero SorrentinoDe rosa Group
2Sig. Pablo MontiCattaneo SPA
3Demian PalmieriMazza-Martinelli SPA
4Sig.ra Loretta MartinoConte-Sartori Group
\n", "
" ], "text/plain": [ " Name Company\n", "0 Dott. Fatima Carbone Caputo s.r.l.\n", "1 Sig. Alighiero Sorrentino De rosa Group\n", "2 Sig. Pablo Monti Cattaneo SPA\n", "3 Demian Palmieri Mazza-Martinelli SPA\n", "4 Sig.ra Loretta Martino Conte-Sartori Group" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame.from_items([('Name', [fakeIT.name() for i in range(5)]),\n", " ('Company', [fakeIT.company() for i in range(5)])])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.4 Create DataFrames fron Numpy Arrays" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONETWO
a23
b56
\n", "
" ], "text/plain": [ " ONE TWO\n", "a 2 3\n", "b 5 6" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.array([[2,5],[3,6]]).T, index=list('ab'), columns=['ONE','TWO'])\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.5 DataFrames can be converted in Numpy Arrays" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[2, 3],\n", " [5, 6]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.asarray(df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.6 DataFrames, Series and Panels" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In Pandas there are 3 main data structure types (\"data container\" objects): \n", "\n", "* **Series:** for one-dimensional data \n", "* **DataFrames:** for bi-dimensional data (matrices) \n", "* **Panels:** for 3D or nD data \n", "\n", "For simplicity, in this course we will describe only pandas Series and DataFrames." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2 Automatic Data Alignment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you can see the dates has been interpreted correctly and used as row index. Notice that the rows in the two datafiles are misaligned, this is not a problem in pandas because the *Automatic Data Alignment* feature: an operation involving the two datasets will simply use `NaN` for the undefined (misaligned) values" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-215.363521e+09NaN6.492465e+087.768698e+08
2015-09-225.648341e+09NaN9.890713e+081.025939e+09
2015-09-234.075016e+09NaN5.204929e+089.961133e+08
2015-09-245.728219e+09NaN6.638585e+081.038861e+09
2015-09-25NaNNaNNaNNaN
2015-09-28NaNNaNNaNNaN
\n", "
" ], "text/plain": [ " AAPL GOOG JNJ XOM\n", "Date \n", "2015-09-21 5.363521e+09 NaN 6.492465e+08 7.768698e+08\n", "2015-09-22 5.648341e+09 NaN 9.890713e+08 1.025939e+09\n", "2015-09-23 4.075016e+09 NaN 5.204929e+08 9.961133e+08\n", "2015-09-24 5.728219e+09 NaN 6.638585e+08 1.038861e+09\n", "2015-09-25 NaN NaN NaN NaN\n", "2015-09-28 NaN NaN NaN NaN" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prices*volumes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Which can be better formatted to a \"2 decimal places float number\" with comma as thousands separator (see Package Options):" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-215,363,520,856.4-649,246,461.1776,869,834.4
2015-09-225,648,340,699.6-989,071,250.81,025,939,479.8
2015-09-234,075,016,424.0-520,492,915.8996,113,291.3
2015-09-245,728,219,000.0-663,858,453.51,038,860,816.9
2015-09-25----
2015-09-28----
\n", "
" ], "text/plain": [ " AAPL GOOG JNJ XOM\n", "Date \n", "2015-09-21 5,363,520,856.4 - 649,246,461.1 776,869,834.4\n", "2015-09-22 5,648,340,699.6 - 989,071,250.8 1,025,939,479.8\n", "2015-09-23 4,075,016,424.0 - 520,492,915.8 996,113,291.3\n", "2015-09-24 5,728,219,000.0 - 663,858,453.5 1,038,860,816.9\n", "2015-09-25 - - - -\n", "2015-09-28 - - - -" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.float_format', lambda x: '{:,.1f}'.format(x)) # formatting\n", "(prices*volumes).replace('nan', '-') # replacing NaN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Example: calculate the volume-weighted average price" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "AAPL 114.5\n", "JNJ 93.0\n", "XOM 72.8\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vwap = (prices*volumes).sum()/volumes.sum()\n", "vwap.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3 Indexing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.1 Label-Based Indexing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`.loc` is strictly label based, will raise KeyError when the items are not found, allowed inputs are:\n", "\n", "* A single label\n", "* A list or array of labels [’a’, ’b’, ’c’]\n", "* A slice object with labels [’a’:’f’] (note that contrary to usual python slices, both the start and the stop are included!)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-21115.2635.493.173.4
2015-09-22113.4622.793.272.7
2015-09-23114.3622.493.072.3
2015-09-24115.0625.892.572.7
2015-09-25114.7612.091.073.2
2015-09-28112.4594.991.472.6
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOG
Date
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HTML(side_by_side2(prices, prices.loc['2012-11-21':'2012-11-27',['AAPL', 'GOOG']]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Columns can be selected without specifying the index:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-21115.2635.493.173.4
2015-09-22113.4622.793.272.7
2015-09-23114.3622.493.072.3
2015-09-24115.0625.892.572.7
2015-09-25114.7612.091.073.2
2015-09-28112.4594.991.472.6
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOG
Date
2015-09-21115.2635.4
2015-09-22113.4622.7
2015-09-23114.3622.4
2015-09-24115.0625.8
2015-09-25114.7612.0
2015-09-28112.4594.9
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HTML(side_by_side2(prices, prices[['AAPL', 'GOOG']]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.2 Position-Based Indexing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`.iloc` is strictly position based, will raise KeyError when the items are out of bounds:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-21115.2635.493.173.4
2015-09-22113.4622.793.272.7
2015-09-23114.3622.493.072.3
2015-09-24115.0625.892.572.7
2015-09-25114.7612.091.073.2
2015-09-28112.4594.991.472.6
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOG
Date
2015-09-22113.4622.7
2015-09-23114.3622.4
2015-09-24115.0625.8
2015-09-25114.7612.0
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HTML(side_by_side2(prices, prices.iloc[1:5,[0, 1]]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "3.3 Mixed Indexing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3.3 Advanced Indexing - .ix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`.ix` will always **try first** to resolve **labeled** index (like `.loc`), **then** it will fall back on **potitional** indexing (like `.loc`)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Rows can be indexed using the ix method. Try by yourself: \n", "\n", " prices.ix[1:4,0:2] # Position-based Indexing \n", " prices.ix[:'2012-11-23'] # Label-based Indexing on index (rows)\n", " prices.ix[:,[2,2,1]] # Duplicated values on columns \n", " prices.ix[::2] # One value every two rows\n", " prices.ix[::-1] # Reverse rows\n", " prices.ix[prices['AAPL'] > 380] # Boolean indexing on index\n", " prices.ix[:,[len(c)<4 for c in prices.columns]] # Boolean indexing on columns" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
JNJXOM
Date
2015-09-2193.173.4
2015-09-2293.272.7
2015-09-2393.072.3
2015-09-2492.572.7
2015-09-2591.073.2
2015-09-2891.472.6
\n", "
" ], "text/plain": [ " JNJ XOM\n", "Date \n", "2015-09-21 93.1 73.4\n", "2015-09-22 93.2 72.7\n", "2015-09-23 93.0 72.3\n", "2015-09-24 92.5 72.7\n", "2015-09-25 91.0 73.2\n", "2015-09-28 91.4 72.6" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prices.ix[:,[len(c)<4 for c in prices.columns]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4 DataFrame Basic Operations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.1 Reindex/Reorder rows and columns" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KW
a23
b56
c89
e1112
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WKT
b6.05.0nan
a3.02.0nan
e12.011.0nan
znannannan
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = np.array([[2,5,8,11],[3,6,9,12]])\n", "d1 = pd.DataFrame(data.T, index=list('abce'), columns=['K','W'])\n", "HTML(side_by_side2(d1, pd.DataFrame(d1, index=list('baez'), columns=['W','K','T'])) )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.2 Calculate new columns" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZB
a231False
b561True
c891True
e11121True
\n", "
" ], "text/plain": [ " K W Z B\n", "a 2 3 1 False\n", "b 5 6 1 True\n", "c 8 9 1 True\n", "e 11 12 1 True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d1['Z'] = d1['W']-d1['K']\n", "d1['B'] = d1['W']>4\n", "d1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.3 Deleting rows and columns" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUM
a231False6.0
b561True13.0
c891True19.0
e11121True25.0
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUM
a231False6.0
e11121True25.0
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWSUM
a236.0
b5613.0
c8919.0
e111225.0
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d1['SUM'] = d1.sum(axis=1)\n", "HTML(side_by_side2(d1, d1.drop(['b', 'c'], axis=0), d1.drop(['Z', 'B'], axis=1)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.4 Inserting colums in a specific position" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUM
a231False6.0
b561True13.0
c891True19.0
e11121True25.0
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KExp(W)WZBSUM
a220.131False6.0
b5403.461True13.0
c88,103.191True19.0
e11162,754.8121True25.0
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d2 = d1.copy() # .copy() method is needed to create a new object.\n", "d2.insert(1, 'Exp(W)', np.exp(d1['W']))\n", "HTML(side_by_side2(d1, d2, space=10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Example: Indexing rows to create a new column with empty values, then use the Forward Fill Padding to fill the gaps" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUMpart
a231False6.02.0
b561True13.05.0
c891True19.05.0
e11121True25.05.0
\n", "
" ], "text/plain": [ " K W Z B SUM part\n", "a 2 3 1 False 6.0 2.0\n", "b 5 6 1 True 13.0 5.0\n", "c 8 9 1 True 19.0 5.0\n", "e 11 12 1 True 25.0 5.0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d1['part'] = d1['K'].ix[:2]\n", "d1['part'] = d1['part'].fillna(method='ffill')\n", "d1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.5 Check if a value or a list of given values are contained in a specific column" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUMpart
a231False6.02.0
b561True13.05.0
c891True19.05.0
e11121True25.05.0
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
K
aFalse
bFalse
cTrue
eFalse
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# TODO: Upgrade side_by_side2 to include series\n", "HTML(side_by_side2(d1, d1['K'].isin([3, 8])))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.6 Rename columns" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KWZBSUMpart
a231False6.02.0
b561True13.05.0
c891True19.05.0
e11121True25.05.0
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONETWOTHREEBSUMpart
a231False6.02.0
b561True13.05.0
c891True19.05.0
e11121True25.05.0
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "HTML(side_by_side2(d1, d1.rename(columns={'K':'ONE','W':'TWO','Z':'THREE'})))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4.7 Iterate efficiently through rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`iterrows` returns an iterator yielding each index value along with a Series containing the data in each row:" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a ** 2 - 3 - 1 - False - 6.0 - 2.0\n", "b ** 5 - 6 - 1 - True - 13.0 - 5.0\n", "c ** 8 - 9 - 1 - True - 19.0 - 5.0\n", "e ** 11 - 12 - 1 - True - 25.0 - 5.0\n" ] } ], "source": [ "for row_index, row in d1.iterrows():\n", " print(row_index, '**', ' - '.join([str(item) for item in row]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`itertuples` returns an iterator yielding a tuple for each row in the DataFrame. The first element of the tuple is the row’s corresponding index value, while the remaining elements are the row values:" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('a', 2, 3, 1, False, 6.0, 2.0)\n", "('b', 5, 6, 1, True, 13.0, 5.0)\n", "('c', 8, 9, 1, True, 19.0, 5.0)\n", "('e', 11, 12, 1, True, 25.0, 5.0)\n" ] } ], "source": [ "for t in d1.itertuples():\n", " print(t)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5 Duplicated Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5.1 Find duplicated data in columns" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abca dupa+b dupa+b dup - take last
0onex-0.3FalseFalseFalse
1oney-0.1TrueFalseFalse
2twoy1.2FalseFalseTrue
3threex-0.5FalseFalseFalse
4twoy1.3TrueTrueFalse
\n", "
" ], "text/plain": [ " a b c a dup a+b dup a+b dup - take last\n", "0 one x -0.3 False False False\n", "1 one y -0.1 True False False\n", "2 two y 1.2 False False True\n", "3 three x -0.5 False False False\n", "4 two y 1.3 True True False" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d3 = pd.read_csv('temp/p01_d2.csv', index_col=0)\n", "d3['a dup'] = d3.duplicated(['a'])\n", "d3['a+b dup'] = d3.duplicated(['a', 'b'])\n", "d3['a+b dup - take last'] = d3.duplicated(['a', 'b'], keep='last')\n", "d3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5.2 Remove Duplicates" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abca dupa+b dupa+b dup - take last
0onex-0.3FalseFalseFalse
1oney-0.1TrueFalseFalse
3threex-0.5FalseFalseFalse
4twoy1.3TrueTrueFalse
\n", "
" ], "text/plain": [ " a b c a dup a+b dup a+b dup - take last\n", "0 one x -0.3 False False False\n", "1 one y -0.1 True False False\n", "3 three x -0.5 False False False\n", "4 two y 1.3 True True False" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d3.drop_duplicates(['a', 'b'],keep='last')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6 Working with Large Arrays" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.1 Control the DataFrame memory occupation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start by generating a DataFrame from a Numpy Array. We'll see than there is no memory overhead on DataFrame Values:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows x Cols x 8: 80000\n", "np Array Memory Occupation: 80000\n", "Dataframe Values Memory Occupation: 80000\n", "Dataframe Index Memory Occupation: 800\n", "Dataframe Columns Memory Occupation: 800\n" ] } ], "source": [ "rows, cols = 100, 100\n", "np_array = np.array(np.random.randn(rows, cols), dtype=np.float64)\n", "d4 = pd.DataFrame(np_array)\n", "print ('Rows x Cols x 8: ', rows*cols*8)\n", "print ('np Array Memory Occupation: ', np_array.nbytes)\n", "print ('Dataframe Values Memory Occupation: ', d4.values.nbytes)\n", "print ('Dataframe Index Memory Occupation: ', d4.index.nbytes)\n", "print ('Dataframe Columns Memory Occupation: ', d4.columns.nbytes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To reduce the memory occupation it's possible to change the value's dtype:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataframe Values Memory Occupation: 20000\n" ] } ], "source": [ "d4 = d4.astype(dtype=np.float16)\n", "print ('Dataframe Values Memory Occupation: ', d4.values.nbytes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If the data is sparse the Dataframe can be sparsified as well to save further resources with the `to_sparse()` method:" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataframe Values Memory Occupation: 20000\n", "Dataframe Values Memory Occupation: 80000\n" ] } ], "source": [ "d4.ix[2:,4:] = np.nan\n", "print ('Dataframe Values Memory Occupation: ', d4.values.nbytes)\n", "d4 = d4.to_sparse()\n", "print ('Dataframe Values Memory Occupation: ', d4.values.nbytes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this case rows and colums are np.int64 arrays:" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,\n", " 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,\n", " 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,\n", " 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,\n", " 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],\n", " dtype='int64')" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d4.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 6.2 Explore large arrays" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Working with large arrays: in Excel is difficult to explore arrays with thousands of lines and columns. Explore the pandas capabilities with the following code. The first line visualize the firts two lines, while the second actually load the whole file. Try to do the same in Excel for comparison." ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [], "source": [ "d3 = pd.read_csv('example_data/p01_d3.csv.gz', compression='gzip')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Istat - Comune - Provincia - Regione - Prefisso - CAP - CodFisco - Abitanti - Link - " ] } ], "source": [ "for col in d3.columns: \n", " print (col, end=' - ')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try by yourself: \n", "\n", " d3.head()\n", " d3[d3.columns[:3]].head()\n", " d3[d3.columns[-4:-1]].tail()\n", " d3.ix[1000:1010, :7]\n", " d3.ix[:, 'Abitanti'].describe()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CAPCodFiscoAbitanti
808733020M200607
808813848M2011152
808987040M2022413
809083030M2031232
809189867M2042055
\n", "
" ], "text/plain": [ " CAP CodFisco Abitanti\n", "8087 33020 M200 607\n", "8088 13848 M201 1152\n", "8089 87040 M202 2413\n", "8090 83030 M203 1232\n", "8091 89867 M204 2055" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d3[d3.columns[-4:-1]].tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 7 Column pct_change and shift" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is possibile to add multiple new columns to a `DataFrame`." ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourfive
03.08.03.01.0nan
15.016.06.02.0nan
27.028.0nan4.0nan
310.037.0nan7.0nan
413.045.015.011.016.0
516.057.018.016.019.0
656.069.0nan65.082.0
772.090.0nan88.091.0
\n", "
" ], "text/plain": [ " one two three four five\n", "0 3.0 8.0 3.0 1.0 nan\n", "1 5.0 16.0 6.0 2.0 nan\n", "2 7.0 28.0 nan 4.0 nan\n", "3 10.0 37.0 nan 7.0 nan\n", "4 13.0 45.0 15.0 11.0 16.0\n", "5 16.0 57.0 18.0 16.0 19.0\n", "6 56.0 69.0 nan 65.0 82.0\n", "7 72.0 90.0 nan 88.0 91.0" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = np.array([[3, 5, 7, 10, 13, 16, 56, 72],\n", " [8, 16, 28, 37, 45, 57, 69, 90],\n", " [3, 6, NaN, NaN, 15, 18, NaN, NaN],\n", " [1, 2, 4, 7, 11, 16, 65, 88],\n", " [NaN, NaN, NaN, NaN, 16, 19, 82, 91]])\n", "d4 = pd.DataFrame(data.T, columns=['one', 'two', 'three', 'four', 'five'])\n", "d4" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourfiveone rettwo ret
03.08.03.01.0nannannan
15.016.06.02.0nan1.72.0
27.028.0nan4.0nan1.41.8
310.037.0nan7.0nan1.41.3
413.045.015.011.016.01.31.2
516.057.018.016.019.01.21.3
656.069.0nan65.082.03.51.2
772.090.0nan88.091.01.31.3
\n", "
" ], "text/plain": [ " one two three four five one ret two ret\n", "0 3.0 8.0 3.0 1.0 nan nan nan\n", "1 5.0 16.0 6.0 2.0 nan 1.7 2.0\n", "2 7.0 28.0 nan 4.0 nan 1.4 1.8\n", "3 10.0 37.0 nan 7.0 nan 1.4 1.3\n", "4 13.0 45.0 15.0 11.0 16.0 1.3 1.2\n", "5 16.0 57.0 18.0 16.0 19.0 1.2 1.3\n", "6 56.0 69.0 nan 65.0 82.0 3.5 1.2\n", "7 72.0 90.0 nan 88.0 91.0 1.3 1.3" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d4[['one ret','two ret']] = d4[['one','two']].pct_change()+1\n", "d4" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourfiveone rettwo retfour var
03.08.03.01.0nannannannan
15.016.06.02.0nan1.72.00.0
27.028.0nan4.0nan1.41.80.7
310.037.0nan7.0nan1.41.31.1
413.045.015.011.016.01.31.21.4
516.057.018.016.019.01.21.31.6
656.069.0nan65.082.03.51.23.9
772.090.0nan88.091.01.31.33.1
\n", "
" ], "text/plain": [ " one two three four five one ret two ret four var\n", "0 3.0 8.0 3.0 1.0 nan nan nan nan\n", "1 5.0 16.0 6.0 2.0 nan 1.7 2.0 0.0\n", "2 7.0 28.0 nan 4.0 nan 1.4 1.8 0.7\n", "3 10.0 37.0 nan 7.0 nan 1.4 1.3 1.1\n", "4 13.0 45.0 15.0 11.0 16.0 1.3 1.2 1.4\n", "5 16.0 57.0 18.0 16.0 19.0 1.2 1.3 1.6\n", "6 56.0 69.0 nan 65.0 82.0 3.5 1.2 3.9\n", "7 72.0 90.0 nan 88.0 91.0 1.3 1.3 3.1" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d4['four var'] = np.log(d4['four'] - d4['four'].shift())\n", "d4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8 Reindex" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`DataFrame.reindex` method conforms a `DataFrame` to a new index, filling cells with no values. It is possible to use this method to rearrange columns." ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
oneone rettwotwo retfourfour var
03.0nan8.0nan1.0nan
15.01.716.02.02.00.0
27.01.428.01.84.00.7
310.01.437.01.37.01.1
413.01.345.01.211.01.4
516.01.257.01.316.01.6
656.03.569.01.265.03.9
772.01.390.01.388.03.1
\n", "
" ], "text/plain": [ " one one ret two two ret four four var\n", "0 3.0 nan 8.0 nan 1.0 nan\n", "1 5.0 1.7 16.0 2.0 2.0 0.0\n", "2 7.0 1.4 28.0 1.8 4.0 0.7\n", "3 10.0 1.4 37.0 1.3 7.0 1.1\n", "4 13.0 1.3 45.0 1.2 11.0 1.4\n", "5 16.0 1.2 57.0 1.3 16.0 1.6\n", "6 56.0 3.5 69.0 1.2 65.0 3.9\n", "7 72.0 1.3 90.0 1.3 88.0 3.1" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d5 = d4.reindex(columns=['one','one ret','two','two ret','four','four var'])\n", "d5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that `DataFrame.reindex` gives a new view, hence `d4` isn't changed." ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefourfiveone rettwo retfour var
03.08.03.01.0nannannannan
15.016.06.02.0nan1.72.00.0
27.028.0nan4.0nan1.41.80.7
310.037.0nan7.0nan1.41.31.1
413.045.015.011.016.01.31.21.4
516.057.018.016.019.01.21.31.6
656.069.0nan65.082.03.51.23.9
772.090.0nan88.091.01.31.33.1
\n", "
" ], "text/plain": [ " one two three four five one ret two ret four var\n", "0 3.0 8.0 3.0 1.0 nan nan nan nan\n", "1 5.0 16.0 6.0 2.0 nan 1.7 2.0 0.0\n", "2 7.0 28.0 nan 4.0 nan 1.4 1.8 0.7\n", "3 10.0 37.0 nan 7.0 nan 1.4 1.3 1.1\n", "4 13.0 45.0 15.0 11.0 16.0 1.3 1.2 1.4\n", "5 16.0 57.0 18.0 16.0 19.0 1.2 1.3 1.6\n", "6 56.0 69.0 nan 65.0 82.0 3.5 1.2 3.9\n", "7 72.0 90.0 nan 88.0 91.0 1.3 1.3 3.1" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 9 More on Indexing: Multi Index" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
CountryNumberDir
Fraonex-0.10.3-0.2
twoy0.3-0.90.3
z1.81.0-1.4
Geronex-0.7-0.50.4
Japonex1.21.2-0.2
twox-0.40.5-0.6
USAoney-1.9-0.90.5
z-0.1-1.00.7
\n", "
" ], "text/plain": [ " 0 1 2\n", "Country Number Dir \n", "Fra one x -0.1 0.3 -0.2\n", " two y 0.3 -0.9 0.3\n", " z 1.8 1.0 -1.4\n", "Ger one x -0.7 -0.5 0.4\n", "Jap one x 1.2 1.2 -0.2\n", " two x -0.4 0.5 -0.6\n", "USA one y -1.9 -0.9 0.5\n", " z -0.1 -1.0 0.7" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d6 = pd.read_csv('temp/p01_d4.csv', index_col=['Country',\n", " 'Number',\n", " 'Dir'])\n", "d6 = d6.sortlevel()\n", "d6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try by yourself:\n", "\n", " d6.ix['Fra']\n", " d6.ix['Fra', 'two']\n", " d6.ix['Fra':'Ger']\n", " d6.reorder_levels([2,1,0], axis=0).sortlevel(0)\n", " d6.reset_index(level=1)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
NumberDir
onex-0.10.3-0.2
twoy0.3-0.90.3
z1.81.0-1.4
\n", "
" ], "text/plain": [ " 0 1 2\n", "Number Dir \n", "one x -0.1 0.3 -0.2\n", "two y 0.3 -0.9 0.3\n", " z 1.8 1.0 -1.4" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d6.ix['Fra'] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 10 Package Options" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The way the DataFrames are displayed can be customized ijn many ways: ([See documentation](http://pandas.pydata.org/pandas-docs/stable/basics.html#working-with-package-options)):" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Display Max Rows: \t 60\n", "display.max_rows : int\n", " If max_rows is exceeded, switch to truncate view. Depending on\n", " `large_repr`, objects are either centrally truncated or printed as\n", " a summary view. 'None' value means unlimited.\n", "\n", " In case python/IPython is running in a terminal and `large_repr`\n", " equals 'truncate' this can be set to 0 and pandas will auto-detect\n", " the height of the terminal and print a truncated object which fits\n", " the screen height. The IPython notebook, IPython qtconsole, or\n", " IDLE do not run in a terminal and hence it is not possible to do\n", " correct auto-detection.\n", " [default: 60] [currently: 60]\n", "\n", "\n" ] } ], "source": [ "print ('Display Max Rows: \\t', pd.get_option('display.max_rows'))\n", "pd.describe_option('display.max_rows')" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.max_rows', 10)\n", "pd.get_option('display.max_rows')" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "60" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.reset_option('display.max_rows')\n", "pd.get_option('display.max_rows')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Try by yourself**:\n", "\n", " pd.describe_option('display.chop_threshold')\n", " pd.describe_option('display.colheader_justify')\n", " pd.describe_option('display.column_space')\n", " pd.describe_option('display.date_dayfirst')\n", " pd.describe_option('display.date_yearfirst')\n", " pd.describe_option('display.encoding')\n", " pd.describe_option('display.expand_frame_repr')\n", " pd.describe_option('display.float_format')\n", " pd.describe_option('display.max_columns')\n", " pd.describe_option('display.max_colwidth')\n", " pd.describe_option('display.max_rows')\n", " pd.describe_option('display.notebook_repr_html')\n", " pd.describe_option('display.precision')\n", " # for more options see documentation..." ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ " AAPL GOOG JNJ XOM\n", "Date \n", "2015-09-21 115.2 635.4 93.1 73.4\n", "2015-09-22 113.4 622.7 93.2 72.7\n", "2015-09-23 114.3 622.4 93.0 72.3\n", "2015-09-24 115.0 625.8 92.5 72.7\n", "2015-09-25 114.7 612.0 91.0 73.2\n", "2015-09-28 112.4 594.9 91.4 72.6" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.precision', 2)\n", "pd.set_option('display.notebook_repr_html', False)\n", "prices" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGJNJXOM
Date
2015-09-21115.2635.493.173.4
2015-09-22113.4622.793.272.7
2015-09-23114.3622.493.072.3
2015-09-24115.0625.892.572.7
2015-09-25114.7612.091.073.2
2015-09-28112.4594.991.472.6
\n", "
" ], "text/plain": [ " AAPL GOOG JNJ XOM\n", "Date \n", "2015-09-21 115.2 635.4 93.1 73.4\n", "2015-09-22 113.4 622.7 93.2 72.7\n", "2015-09-23 114.3 622.4 93.0 72.3\n", "2015-09-24 115.0 625.8 92.5 72.7\n", "2015-09-25 114.7 612.0 91.0 73.2\n", "2015-09-28 112.4 594.9 91.4 72.6" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option('display.notebook_repr_html', True)\n", "prices" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "Visit [www.add-for.com]() for more tutorials and updates.\n", "\n", "This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }