{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],\n", " [np.nan, np.nan], [0.75, -1.3]],\n", " index=['a', 'b', 'c', 'd'],\n", " columns=['one', 'two'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b7.10-4.5
cNaNNaN
d0.75-1.3
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.5\n", "c NaN NaN\n", "d 0.75 -1.3" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one 9.25\n", "two -5.80\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# printning sum of df columns\n", "df.sum()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one 3.083333\n", "two -2.900000\n", "dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ " df.mean()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1.40\n", "b 2.60\n", "c 0.00\n", "d -0.55\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis=1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 2.60\n", "c NaN\n", "d -0.55\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# NA values are excluded unless the entire slice is NA. This can be disabled using the skipna option\n", "df.sum(axis=1, skipna=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Options for reduction method\n", "```\n", "axis Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.\n", "skipna Exclude missing values, True by default.\n", "level Reduce grouped by level if the axis is hierarchically-indexed (MultiIndex).\n", "```" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one d\n", "two b\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained\n", "df.idxmin()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b8.50-4.5
cNaNNaN
d9.25-5.8
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 8.50 -4.5\n", "c NaN NaN\n", "d 9.25 -5.8" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# for cumulative sum\n", "df.cumsum()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\numpy\\lib\\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile\n", " RuntimeWarning)\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
count3.0000002.000000
mean3.083333-2.900000
std3.4936852.262742
min0.750000-4.500000
25%NaNNaN
50%NaNNaN
75%NaNNaN
max7.100000-1.300000
\n", "
" ], "text/plain": [ " one two\n", "count 3.000000 2.000000\n", "mean 3.083333 -2.900000\n", "std 3.493685 2.262742\n", "min 0.750000 -4.500000\n", "25% NaN NaN\n", "50% NaN NaN\n", "75% NaN NaN\n", "max 7.100000 -1.300000" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# describing \n", "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Descriptive and summary statistics\n", "```\n", "count Number of non-NA values\n", "describe Compute set of summary statistics for Series or each DataFrame column\n", "min, max Compute minimum and maximum values\n", "argmin, argmax Compute index locations (integers) at which minimum or maximum value obtained, respectively\n", "idxmin, idxmax Compute index values at which minimum or maximum value obtained, respectively\n", "quantile Compute sample quantile ranging from 0 to 1\n", "sum Sum of values\n", "mean Mean of values\n", "median Arithmetic median (50% quantile) of values\n", "mad Mean absolute deviation from mean value\n", "var Sample variance of values\n", "std Sample standard deviation of values\n", "skew Sample skewness (3rd moment) of values\n", "kurt Sample kurtosis (4th moment) of values\n", "cumsum Cumulative sum of values\n", "cummin, cummax Cumulative minimum or maximum of values, respectively\n", "cumprod Cumulative product of values\n", "diff Compute 1st arithmetic difference (useful for time series)\n", "pct_change Compute percent changes\n", "```" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\numpy\\lib\\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile\n", " RuntimeWarning)\n" ] }, { "data": { "text/plain": [ "one NaN\n", "two NaN\n", "dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.quantile()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 a\n", "1 a\n", "2 b\n", "3 c\n", "4 a\n", "5 a\n", "6 b\n", "7 c\n", "8 a\n", "9 a\n", "10 b\n", "11 c\n", "12 a\n", "13 a\n", "14 b\n", "15 c\n", "dtype: object\n" ] } ], "source": [ "obj = pd.Series(['a', 'a', 'b', 'c'] * 4)\n", "print(obj)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 16\n", "unique 3\n", "top a\n", "freq 8\n", "dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Correlation and Covariance" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\data.py:35: FutureWarning: \n", "The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.\n", "After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.\n", " FutureWarning)\n" ] } ], "source": [ "import pandas.io.data as web\n", "\n", "all_data = {}\n", "\n", "for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:\n", " all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')\n", " price = pd.DataFrame({tic: data['Adj Close']\n", " for tic, data in all_data.items()})\n", " \n", "volume = pd.DataFrame({tic: data['Volume']\n", " for tic, data in all_data.items()})" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "returns = price.pct_change()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
Date
2009-12-240.0343390.0111170.0043850.002587
2009-12-280.0122940.0070980.0133260.005484
2009-12-29-0.011861-0.005571-0.0034770.007058
2009-12-300.0121470.0053760.005461-0.013699
2009-12-31-0.004300-0.004416-0.012597-0.015504
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "Date \n", "2009-12-24 0.034339 0.011117 0.004385 0.002587\n", "2009-12-28 0.012294 0.007098 0.013326 0.005484\n", "2009-12-29 -0.011861 -0.005571 -0.003477 0.007058\n", "2009-12-30 0.012147 0.005376 0.005461 -0.013699\n", "2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.tail()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.4959796261031541" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.MSFT.corr(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.00021595760076743132" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ " returns.MSFT.cov(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
AAPL1.0000000.4706760.4100110.424305
GOOG0.4706761.0000000.3906890.443587
IBM0.4100110.3906891.0000000.495980
MSFT0.4243050.4435870.4959801.000000
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "AAPL 1.000000 0.470676 0.410011 0.424305\n", "GOOG 0.470676 1.000000 0.390689 0.443587\n", "IBM 0.410011 0.390689 1.000000 0.495980\n", "MSFT 0.424305 0.443587 0.495980 1.000000" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corr()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
AAPL0.0010270.0003030.0002520.000309
GOOG0.0003030.0005800.0001420.000205
IBM0.0002520.0001420.0003670.000216
MSFT0.0003090.0002050.0002160.000516
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "AAPL 0.001027 0.000303 0.000252 0.000309\n", "GOOG 0.000303 0.000580 0.000142 0.000205\n", "IBM 0.000252 0.000142 0.000367 0.000216\n", "MSFT 0.000309 0.000205 0.000216 0.000516" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ " returns.cov()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "AAPL 0.410011\n", "GOOG 0.390689\n", "IBM 1.000000\n", "MSFT 0.495980\n", "dtype: float64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corrwith(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "AAPL -0.057549\n", "GOOG 0.062647\n", "IBM -0.007892\n", "MSFT -0.014245\n", "dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corrwith(volume)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unique Values, Value Counts, and Membership" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 c\n", "1 a\n", "2 d\n", "3 a\n", "4 a\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object\n" ] } ], "source": [ "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n", "print(obj)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['c', 'a', 'd', 'b'], dtype=object)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.unique()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 3\n", "c 3\n", "b 2\n", "d 1\n", "dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ " obj.value_counts()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "c 3\n", "d 1\n", "b 2\n", "a 3\n", "dtype: int64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(obj.values, sort=False)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "dtype: bool\n" ] } ], "source": [ "mask = obj.isin(['b', 'c'])\n", "print(mask)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 c\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object\n" ] } ], "source": [ "print(obj[mask])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Unique, value counts, and binning method\n", "```\n", "isin Compute boolean array indicating whether each Series value is contained in the passed sequence of values.\n", "unique Compute array of unique values in a Series, returned in the order observed.\n", "value_counts Return a Series containing unique values as its index and frequencies as its values, ordered count in\n", "descending order.\n", "```" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Qu1 Qu2 Qu3\n", "0 1 2 1\n", "1 3 3 5\n", "2 4 1 2\n", "3 3 2 4\n", "4 4 3 4\n" ] } ], "source": [ "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", " 'Qu2': [2, 3, 1, 2, 3],\n", " 'Qu3': [1, 5, 2, 4, 4]})\n", "print(data)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Qu1Qu2Qu3
11.01.01.0
2NaN2.01.0
32.02.0NaN
42.0NaN2.0
5NaNNaN1.0
\n", "
" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "1 1.0 1.0 1.0\n", "2 NaN 2.0 1.0\n", "3 2.0 2.0 NaN\n", "4 2.0 NaN 2.0\n", "5 NaN NaN 1.0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = data.apply(pd.value_counts)\n", "result" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Qu1Qu2Qu3
11.01.01.0
20.02.01.0
32.02.00.0
42.00.02.0
50.00.01.0
\n", "
" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "1 1.0 1.0 1.0\n", "2 0.0 2.0 1.0\n", "3 2.0 2.0 0.0\n", "4 2.0 0.0 2.0\n", "5 0.0 0.0 1.0" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result.fillna(0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Handling Missing Data" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 aardvark\n", "1 artichoke\n", "2 NaN\n", "3 avocado\n", "dtype: object\n" ] } ], "source": [ "string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n", "print(string_data)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 True\n", "3 False\n", "dtype: bool" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data.isnull()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 True\n", "3 False\n", "dtype: bool" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data[0] = None # None is also treated as NaN\n", "string_data.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### NA handling methods\n", "```\n", "dropna Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.\n", "fillna Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.\n", "isnull Return like-type object containing boolean values indicating which values are missing / NA.\n", "notnull Negation of isnull.\n", "``` " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering Out Missing Data" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 1.0\n", "1 NaN\n", "2 3.5\n", "3 NaN\n", "4 7.0\n", "dtype: float64\n" ] } ], "source": [ "from numpy import nan as NA\n", "\n", "data = pd.Series([1, NA, 3.5, NA, 7])\n", "print(data)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "2 3.5\n", "4 7.0\n", "dtype: float64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna() # dropping na records" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "2 3.5\n", "4 7.0\n", "dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# same can be achieved as binary filtering\n", "data[data.notnull()]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2\n", "0 1.0 6.5 3.0\n", "1 1.0 NaN NaN\n", "2 NaN NaN NaN\n", "3 NaN 6.5 3.0\n" ] } ], "source": [ "data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],\n", " [NA, NA, NA], [NA, 6.5, 3.]])\n", "print(data)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.06.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 6.5 3.0" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.06.53.0
11.0NaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 6.5 3.0\n", "1 1.0 NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(how='all') # this will remove only those records which has NaN in all columns" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2 4\n", "0 1.0 6.5 3.0 NaN\n", "1 1.0 NaN NaN NaN\n", "2 NaN NaN NaN NaN\n", "3 NaN 6.5 3.0 NaN\n" ] } ], "source": [ "# can drop the columns same way with axis = 1\n", "data[4] = NA\n", "print(data)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.06.53.0
11.0NaNNaN
2NaNNaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 6.5 3.0\n", "1 1.0 NaN NaN\n", "2 NaN NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(how='all', axis=1)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.06.53.0
11.0NaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 6.5 3.0\n", "1 1.0 NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(how='all', axis=1).dropna(how='all')" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2\n", "0 -1.030615 0.602560 -0.387511\n", "1 2.186335 -2.965361 0.432411\n", "2 -0.392753 -1.305728 0.178472\n", "3 -1.287533 0.233790 1.783785\n", "4 0.012547 -0.827266 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043\n" ] } ], "source": [ "df = pd.DataFrame(np.random.randn(7, 3))\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2\n", "0 -1.030615 NaN NaN\n", "1 2.186335 NaN NaN\n", "2 -0.392753 NaN NaN\n", "3 -1.287533 NaN 1.783785\n", "4 0.012547 NaN 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043\n" ] } ], "source": [ "df.ix[:4, 1] = NA; \n", "df.ix[:2, 2] = NA;\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
3-1.287533NaN1.783785
40.012547NaN0.196979
5-0.027398-0.6169370.645708
61.4076880.1784753.133043
\n", "
" ], "text/plain": [ " 0 1 2\n", "3 -1.287533 NaN 1.783785\n", "4 0.012547 NaN 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dropna(thresh=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filling in Missing Data" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0124
01.06.53.0999.0
11.0999.0999.0999.0
2999.0999.0999.0999.0
3999.06.53.0999.0
\n", "
" ], "text/plain": [ " 0 1 2 4\n", "0 1.0 6.5 3.0 999.0\n", "1 1.0 999.0 999.0 999.0\n", "2 999.0 999.0 999.0 999.0\n", "3 999.0 6.5 3.0 999.0" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.fillna(999) # replacing na with some value" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
0-1.0306150.0000000.000000
12.1863350.0000000.000000
2-0.3927530.0000000.000000
3-1.2875330.0000001.783785
40.0125470.0000000.196979
5-0.027398-0.6169370.645708
61.4076880.1784753.133043
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 -1.030615 0.000000 0.000000\n", "1 2.186335 0.000000 0.000000\n", "2 -0.392753 0.000000 0.000000\n", "3 -1.287533 0.000000 1.783785\n", "4 0.012547 0.000000 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
0-1.0306150.500000-1.000000
12.1863350.500000-1.000000
2-0.3927530.500000-1.000000
3-1.2875330.5000001.783785
40.0125470.5000000.196979
5-0.027398-0.6169370.645708
61.4076880.1784753.133043
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 -1.030615 0.500000 -1.000000\n", "1 2.186335 0.500000 -1.000000\n", "2 -0.392753 0.500000 -1.000000\n", "3 -1.287533 0.500000 1.783785\n", "4 0.012547 0.500000 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna({1: 0.5, 2: -1})" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
0-1.030615NaNNaN
12.186335NaNNaN
2-0.392753NaNNaN
3-1.287533NaN1.783785
40.012547NaN0.196979
5-0.027398-0.6169370.645708
61.4076880.1784753.133043
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 -1.030615 NaN NaN\n", "1 2.186335 NaN NaN\n", "2 -0.392753 NaN NaN\n", "3 -1.287533 NaN 1.783785\n", "4 0.012547 NaN 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fillna returns a new object, but you can modify the existing object in place\n", "df" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
0-1.0306150.0000000.000000
12.1863350.0000000.000000
2-0.3927530.0000000.000000
3-1.2875330.0000001.783785
40.0125470.0000000.196979
5-0.027398-0.6169370.645708
61.4076880.1784753.133043
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 -1.030615 0.000000 0.000000\n", "1 2.186335 0.000000 0.000000\n", "2 -0.392753 0.000000 0.000000\n", "3 -1.287533 0.000000 1.783785\n", "4 0.012547 0.000000 0.196979\n", "5 -0.027398 -0.616937 0.645708\n", "6 1.407688 0.178475 3.133043" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "_ = df.fillna(0, inplace=True) # modify the existing object in place\n", "df" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.2507920.4309611.018721
1-0.681740-0.679749-0.765389
2-2.656496NaN-1.899419
30.879665NaN-0.637943
40.698958NaNNaN
5-1.493191NaNNaN
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.250792 0.430961 1.018721\n", "1 -0.681740 -0.679749 -0.765389\n", "2 -2.656496 NaN -1.899419\n", "3 0.879665 NaN -0.637943\n", "4 0.698958 NaN NaN\n", "5 -1.493191 NaN NaN" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(6, 3))\n", "df.ix[2:, 1] = NA\n", "df.ix[4:, 2] = NA\n", "df" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.2507920.4309611.018721
1-0.681740-0.679749-0.765389
2-2.656496-0.679749-1.899419
30.879665-0.679749-0.637943
40.698958-0.679749-0.637943
5-1.493191-0.679749-0.637943
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.250792 0.430961 1.018721\n", "1 -0.681740 -0.679749 -0.765389\n", "2 -2.656496 -0.679749 -1.899419\n", "3 0.879665 -0.679749 -0.637943\n", "4 0.698958 -0.679749 -0.637943\n", "5 -1.493191 -0.679749 -0.637943" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(method='ffill') # repeat the last value of columns into NaN field" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.2507920.4309611.018721
1-0.681740-0.679749-0.765389
2-2.656496-0.679749-1.899419
30.879665-0.679749-0.637943
40.698958NaN-0.637943
5-1.493191NaN-0.637943
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.250792 0.430961 1.018721\n", "1 -0.681740 -0.679749 -0.765389\n", "2 -2.656496 -0.679749 -1.899419\n", "3 0.879665 -0.679749 -0.637943\n", "4 0.698958 NaN -0.637943\n", "5 -1.493191 NaN -0.637943" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(method='ffill', limit=2) # limit the fill" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.2507920.4309611.018721
1-0.681740-0.679749-0.765389
2-2.656496-0.124394-1.899419
30.879665-0.124394-0.637943
40.698958-0.124394-0.571007
5-1.493191-0.124394-0.571007
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.250792 0.430961 1.018721\n", "1 -0.681740 -0.679749 -0.765389\n", "2 -2.656496 -0.124394 -1.899419\n", "3 0.879665 -0.124394 -0.637943\n", "4 0.698958 -0.124394 -0.571007\n", "5 -1.493191 -0.124394 -0.571007" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(df.mean())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### fillna function arguments\n", "```\n", "value Scalar value or dict-like object to use to fill missing values\n", "method Interpolation, by default 'ffill' if function called with no other arguments\n", "axis Axis to fill on, default axis=0\n", "inplace Modify the calling object without producing a copy\n", "limit For forward and backward filling, maximum number of consecutive periods to fill\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }