{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Possible data inputs to DataFrame constructor\n", "```\n", "2D ndarray A matrix of data, passing optional row and column labels\n", "dict of arrays, lists, or tuples Each sequence becomes a column in the DataFrame. All sequences must be the same length.\n", "NumPy structured/record array Treated as the “dict of arrays” case\n", "dict of Series Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.\n", "dict of dicts Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.\n", "list of dicts or Series Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels\n", "List of lists or tuples Treated as the “2D ndarray” case\n", "Another DataFrame The DataFrame’s indexes are used unless different ones are passed\n", "NumPy MaskedArray Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result\n", "```" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " \n" ] } ], "source": [ "state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']\n", "year = [2000, 2001, 2002, 2001, 2002]\n", "pop = [1.5, 1.7, 3.6, 2.4, 2.9]\n", "\n", "print(type(state), type(year), type(pop))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 3 columns):\n", "pop 5 non-null float64\n", "state 5 non-null object\n", "year 5 non-null int64\n", "dtypes: float64(1), int64(1), 
object(1)\n", "memory usage: 200.0+ bytes\n", "None\n" ] } ], "source": [ "# creating dataframe\n", "df = pd.DataFrame({'state':state, 'year':year, 'pop':pop})\n", "print(df.info())" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pop state year\n", "0 1.5 Ohio 2000\n", "1 1.7 Ohio 2001\n", "2 3.6 Ohio 2002\n", "3 2.4 Nevada 2001\n", "4 2.9 Nevada 2002\n" ] } ], "source": [ "print(df)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'year': [2000, 2001, 2002, 2001, 2002], 'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]} \n", " \n" ] } ], "source": [ "sdata = {'state':state, 'year':year, 'pop':pop}\n", "print(sdata,\"\\n\",type(sdata))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Empty DataFrame\n", "Columns: [pop1, state1, year1]\n", "Index: []\n" ] } ], "source": [ "df = pd.DataFrame(sdata, columns=['pop1', 'state1', 'year1']) # we can not rename columns like this, but create column names \n", " # if doesn't exists\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pop1 state year\n", "0 NaN Ohio 2000\n", "1 NaN Ohio 2001\n", "2 NaN Ohio 2002\n", "3 NaN Nevada 2001\n", "4 NaN Nevada 2002\n" ] } ], "source": [ "df = pd.DataFrame(sdata, columns=['pop1', 'state', 'year']) # this will pick those columns from sdata which matched\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['pop', 'state', 'year'], dtype='object')\n" ] } ], "source": [ "df = 
pd.DataFrame(sdata)\n", "print(df.columns)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pop1 state1 year1\n", "one 1.5 Ohio 2000\n", "two 1.7 Ohio 2001\n", "three 3.6 Ohio 2002\n", "four 2.4 Nevada 2001\n", "five 2.9 Nevada 2002\n" ] } ], "source": [ "# renaming columns and index\n", "df.columns = ['pop1', 'state1', 'year1']\n", "df.index = ['one', 'two', 'three', 'four', 'five']\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['one', 'two', 'three', 'four', 'five'], dtype='object') \n", " (5, 3) \n", " Index(['pop1', 'state1', 'year1'], dtype='object')\n" ] } ], "source": [ "# stats about dataframe\n", "print(df.index, \"\\n\", df.shape, \"\\n\", df.columns)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pop1 state1 year1\n", "one 1.5 Ohio 2000\n", "two 1.5 Ohio 2001\n", "three 1.5 Ohio 2002\n", "four 1.5 Nevada 2001\n", "five 1.5 Nevada 2002\n" ] } ], "source": [ "df['pop1'] = 1.5\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pop1 state1 year1\n", "one 0 Ohio 2000\n", "two 1 Ohio 2001\n", "three 2 Ohio 2002\n", "four 3 Nevada 2001\n", "five 4 Nevada 2002\n" ] } ], "source": [ "df['pop1'] = range(5)\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one Ohio\n", "two Ohio\n", "three Ohio\n", "four Nevada\n", "five Nevada\n", "Name: state1, dtype: object\n" ] } ], "source": [ "# can access the data as \n", "print(df['state1'])" ] }, { "cell_type": "code", "execution_count": 
14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one Ohio\n", "two Ohio\n", "three Ohio\n", "four Nevada\n", "five Nevada\n", "Name: state1, dtype: object\n" ] } ], "source": [ "print(df.state1)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1 year1\n", "one Ohio 2000\n", "two Ohio 2001\n", "three Ohio 2002\n", "four Nevada 2001\n", "five Nevada 2002\n" ] } ], "source": [ "# for deleting any columns\n", "del df['pop1']\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " one two three four five\n", "state1 Ohio Ohio Ohio Nevada Nevada\n", "year1 2000 2001 2002 2001 2002\n" ] } ], "source": [ "# transpose the dataframe\n", "dft = df.T\n", "print(dft)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1\n", "year1 \n", "2000 Ohio\n", "2001 Ohio\n", "2002 Ohio\n", "2001 Nevada\n", "2002 Nevada\n" ] } ], "source": [ "# using columns as an index\n", "df.index = df['year1']\n", "del df['year1']\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(None, 'year1')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.name, df.index.name" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['state1'], dtype='object')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ 
"array([['Ohio'],\n", " ['Ohio'],\n", " ['Ohio'],\n", " ['Nevada'],\n", " ['Nevada']], dtype=object)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# printing values\n", "df.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Index methods and properties\n", "```\n", "append Concatenate with additional Index objects, producing a new Index\n", "diff Compute set difference as an Index\n", "intersection Compute set intersection\n", "union Compute set union\n", "isin Compute boolean array indicating whether each value is contained in the passed collection\n", "delete Compute new Index with element at index i deleted\n", "drop Compute new index by deleting passed values\n", "insert Compute new Index by inserting element at index i\n", "is_monotonic Returns True if each element is greater than or equal to the previous element\n", "is_unique Returns True if the Index has no duplicate values\n", "unique Compute the array of unique values in the Index\n", "```" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Series and DataFrames index are mutable\n", "df.index" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#df.index[2]=2009 # this will throw a error" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Reindex Series or DataFrme\n", "```\n", "index New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. 
An Index will be used exactly as is without any copying\n", "method Interpolation (fill) method, see Table 5-4 for options.\n", "fill_value Substitute value to use when introducing missing data by reindexing\n", "limit When forward- or backfilling, maximum size gap to fill\n", "level Match simple Index on level of MultiIndex, otherwise select subset of\n", "copy If False, do not copy underlying data if new index is equivalent to old index. True by default (i.e. always copy data).\n", "```" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1\n", "year1 \n", "2000 Ohio\n", "2001 Ohio\n", "2002 Ohio\n", "2001 Nevada\n", "2002 Nevada\n" ] } ], "source": [ "print(df)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Int64Index([2000, 2001, 2002, 2001, 2002], dtype='int64', name='year1')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# df2 = df.reindex([2000, 2001, 2002, 2001, 2002, 2009]) \n", "# this will raise a ValueError, because df.index contains duplicate labels and reindex requires a unique index" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Ohio Texas California\n", "a 0 1 2\n", "c 3 4 5\n", "d 6 7 8\n" ] } ], "source": [ "frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'])\n", "print(frame)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Ohio Texas California\n", "a 0.0 1.0 2.0\n", "b NaN NaN NaN\n", "c 3.0 4.0 5.0\n", "d 6.0 7.0 8.0\n" ] } ], "source": [ "frame2 = frame.reindex(['a', 
'b', 'c', 'd'])\n", "print(frame2)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1 year\n", "0 Ohio 2000\n", "1 Ohio 2001\n", "2 Ohio 2002\n", "3 Nevada 2001\n", "4 Nevada 2002\n" ] } ], "source": [ "# likewise let's revert the df\n", "df['year'] = df.index\n", "df.index = [0,1,2,3,4]\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1 year\n", "1 Ohio 2001.0\n", "2 Ohio 2002.0\n", "3 Nevada 2001.0\n", "4 Nevada 2002.0\n", "5 NaN NaN\n", "6 NaN NaN\n", "7 NaN NaN\n" ] } ], "source": [ "# now we can reindex this df\n", "df2 = df.reindex([1,2,3,4,5,6,7]) # reindex keeps the labels that already exist in df and adds the missing ones as NaN rows\n", "print(df2) # here it keeps 1,2,3,4, drops 0, and creates new rows for 5,6,7" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1 year\n", "1 Ohio 2001.0\n", "2 Ohio 2002.0\n", "3 Nevada 2001.0\n", "4 Nevada 2002.0\n", "6 NaN NaN\n" ] } ], "source": [ "# label-based selection with .ix is a more concise alternative (note: .ix is deprecated since pandas 0.20; prefer .loc)\n", "df3=df2.ix[[1,2,3,4,6]]\n", "print(df3)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state1 year population\n", "1 Ohio 2001.0 NaN\n", "2 Ohio 2002.0 NaN\n", "3 Nevada 2001.0 NaN\n", "4 Nevada 2002.0 NaN\n", "6 NaN NaN NaN\n" ] } ], "source": [ "# can alter the columns as well; a label that does not exist yet ('population') becomes an all-NaN column\n", "new_columns = ['state1', 'year', 'population']\n", "df4 = df3.ix[[1,2,3,4,6], new_columns]\n", "print(df4)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['state1', 'year', 'population'], dtype='object')" ] }, 
"execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4.columns" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state year pop\n", "1 Ohio 2001.0 NaN\n", "2 Ohio 2002.0 NaN\n", "3 Nevada 2001.0 NaN\n", "4 Nevada 2002.0 NaN\n", "6 NaN NaN NaN\n" ] } ], "source": [ "# renaming columns\n", "df4.columns = ['state', 'year', 'pop']\n", "print(df4)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state year pop\n", "1 Ohio 2001.0 NaN\n", "2 Ohio 2002.0 NaN\n", "4 Nevada 2002.0 NaN\n", "6 NaN NaN NaN\n" ] } ], "source": [ "# dropping index or columns\n", "df5=df4.drop([3])\n", "print(df5)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state year\n", "1 Ohio 2001.0\n", "2 Ohio 2002.0\n", "4 Nevada 2002.0\n", "6 NaN NaN\n" ] } ], "source": [ "df5 = df5.drop(['pop'], axis=1)\n", "print(df5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing, selection, and filtering" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyearpop
1Ohio2001.0NaN
2Ohio2002.0NaN
3Nevada2001.0NaN
4Nevada2002.0NaN
6NaNNaNNaN
\n", "
" ], "text/plain": [ " state year pop\n", "1 Ohio 2001.0 NaN\n", "2 Ohio 2002.0 NaN\n", "3 Nevada 2001.0 NaN\n", "4 Nevada 2002.0 NaN\n", "6 NaN NaN NaN" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyearpop
1Ohio2001.0NaN
2Ohio2002.0NaN
\n", "
" ], "text/plain": [ " state year pop\n", "1 Ohio 2001.0 NaN\n", "2 Ohio 2002.0 NaN" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4[df4['state']=='Ohio']" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyear
1Ohio2001.0
2Ohio2002.0
3Nevada2001.0
4Nevada2002.0
6NaNNaN
\n", "
" ], "text/plain": [ " state year\n", "1 Ohio 2001.0\n", "2 Ohio 2002.0\n", "3 Nevada 2001.0\n", "4 Nevada 2002.0\n", "6 NaN NaN" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4[['state', 'year']]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " if __name__ == '__main__':\n" ] } ], "source": [ "df4['year'][df4['state']=='Ohio']=2004" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateyearpop
1Ohio2004.0NaN
2Ohio2004.0NaN
3Nevada2001.0NaN
4Nevada2002.0NaN
6NaNNaNNaN
\n", "
" ], "text/plain": [ " state year pop\n", "1 Ohio 2004.0 NaN\n", "2 Ohio 2004.0 NaN\n", "3 Nevada 2001.0 NaN\n", "4 Nevada 2002.0 NaN\n", "6 NaN NaN NaN" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state
1Ohio
2Ohio
\n", "
" ], "text/plain": [ " state\n", "1 Ohio\n", "2 Ohio" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ix enables you to select a subset of the rows and columns from a DataFrame with NumPy like notation plus axis labels\n", "df4.ix[[1,2],['state']]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statepop
3NevadaNaN
6NaNNaN
\n", "
" ], "text/plain": [ " state pop\n", "3 Nevada NaN\n", "6 NaN NaN" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4.ix[[3,6],[0,2]]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statepop
3NevadaNaN
4NevadaNaN
\n", "
" ], "text/plain": [ " state pop\n", "3 Nevada NaN\n", "4 Nevada NaN" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4.ix[df4['year']<2003,[0,2]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Indexing options with DataFrame\n", "```\n", "obj[val] Select single column or sequence of columns from the DataFrame. Special case con-veniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion).\n", "obj.ix[val] Selects single row of subset of rows from the DataFrame.\n", "obj.ix[:, val] Selects single column of subset of columns.\n", "obj.ix[val1, val2] Select both rows and columns. \n", "reindex method Conform one or more axes to new indexes. \n", "xs method Select single row or column as a Series by label.\n", "icol, irow methods Select single column or row, respectively, as a Series by integer location.\n", "get_value, set_value methods Select single value by row and column label.\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Arithmetic and data alignment" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 5.2\n", "c 1.1\n", "d NaN\n", "e 0.0\n", "f NaN\n", "g NaN\n", "dtype: float64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 + s2 #assigned NaN for those index which is not found in another series" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])\n", "df2 = 
pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bcde
ColoradoNaNNaNNaNNaN
Ohio3.0NaN6.0NaN
OregonNaNNaNNaNNaN
Texas9.0NaN12.0NaN
UtahNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " b c d e\n", "Colorado NaN NaN NaN NaN\n", "Ohio 3.0 NaN 6.0 NaN\n", "Oregon NaN NaN NaN NaN\n", "Texas 9.0 NaN 12.0 NaN\n", "Utah NaN NaN NaN NaN" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 + df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Arithmetic methods with fill values" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bcde
Colorado6.07.08.0NaN
Ohio3.01.06.05.0
Oregon9.0NaN10.011.0
Texas9.04.012.08.0
Utah0.0NaN1.02.0
\n", "
" ], "text/plain": [ " b c d e\n", "Colorado 6.0 7.0 8.0 NaN\n", "Ohio 3.0 1.0 6.0 5.0\n", "Oregon 9.0 NaN 10.0 11.0\n", "Texas 9.0 4.0 12.0 8.0\n", "Utah 0.0 NaN 1.0 2.0" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.add(df2, fill_value=0)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Ohio0.02.00
Texas3.05.00
Colorado6.08.00
\n", "
" ], "text/plain": [ " b d e\n", "Ohio 0.0 2.0 0\n", "Texas 3.0 5.0 0\n", "Colorado 6.0 8.0 0" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# when reindexing a Series or DataFrame, you can also specify a different fill value\n", "df1.reindex(columns=df2.columns, fill_value=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Flexible arithmetic methods\n", "```\n", "add Method for addition (+)\n", "sub Method for subtraction (-)\n", "div Method for division (/)\n", "mul Method for multiplication (*)\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Operations between DataFrame and Series" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah0.01.02.0
Ohio3.04.05.0
Texas6.07.08.0
Oregon9.010.011.0
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0.0 1.0 2.0\n", "Ohio 3.0 4.0 5.0\n", "Texas 6.0 7.0 8.0\n", "Oregon 9.0 10.0 11.0" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 0.0\n", "d 1.0\n", "e 2.0\n", "Name: Utah, dtype: float64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series = frame.ix[0] # pickng first row\n", "series" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah0.01.04.0
Ohio0.04.010.0
Texas0.07.016.0
Oregon0.010.022.0
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0.0 1.0 4.0\n", "Ohio 0.0 4.0 10.0\n", "Texas 0.0 7.0 16.0\n", "Oregon 0.0 10.0 22.0" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame * series" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah0.00.00.0
Ohio3.03.03.0
Texas6.06.06.0
Oregon9.09.09.0
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0.0 0.0 0.0\n", "Ohio 3.0 3.0 3.0\n", "Texas 6.0 6.0 6.0\n", "Oregon 9.0 9.0 9.0" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, \n", "# broadcasting down the rows:\n", "frame - series" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bdef
Utah0.0NaN2.0NaN
Ohio0.0NaN5.0NaN
Texas0.0NaN8.0NaN
Oregon0.0NaN11.0NaN
\n", "
" ], "text/plain": [ " b d e f\n", "Utah 0.0 NaN 2.0 NaN\n", "Ohio 0.0 NaN 5.0 NaN\n", "Texas 0.0 NaN 8.0 NaN\n", "Oregon 0.0 NaN 11.0 NaN" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series2 = pd.Series(range(3), index=['b', 'e', 'f'])\n", "frame * series2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Function application and mapping" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ "f = lambda x : x.max() - x.min()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b d e\n", "Utah 0.572069 1.094135 0.809313\n", "Ohio -0.205726 0.284662 0.193174\n", "Texas -1.313522 0.970605 -1.067873\n", "Oregon 0.052951 0.225374 -0.274396\n" ] } ], "source": [ "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", "print(frame)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 1.885591\n", "d 0.868761\n", "e 1.877187\n", "dtype: float64" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Utah 0.522065\n", "Ohio 0.490388\n", "Texas 2.284127\n", "Oregon 0.499770\n", "dtype: float64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f, axis=1)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# defining a func\n", "def f(x):\n", " return pd.Series([x.max(), x.min()], index=['max', 'min'])" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
max0.5720691.0941350.809313
min-1.3135220.225374-1.067873
\n", "
" ], "text/plain": [ " b d e\n", "max 0.572069 1.094135 0.809313\n", "min -1.313522 0.225374 -1.067873" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f)" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
maxmin
Utah1.0941350.572069
Ohio0.284662-0.205726
Texas0.970605-1.313522
Oregon0.225374-0.274396
\n", "
" ], "text/plain": [ " max min\n", "Utah 1.094135 0.572069\n", "Ohio 0.284662 -0.205726\n", "Texas 0.970605 -1.313522\n", "Oregon 0.225374 -0.274396" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f, axis=1)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah0.571.090.81
Ohio-0.210.280.19
Texas-1.310.97-1.07
Oregon0.050.23-0.27
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0.57 1.09 0.81\n", "Ohio -0.21 0.28 0.19\n", "Texas -1.31 0.97 -1.07\n", "Oregon 0.05 0.23 -0.27" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "format = lambda x: '%.2f' % x\n", "frame.applymap(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sorting and ranking" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 0\n", "a 1\n", "b 2\n", "c 3\n", "dtype: int32" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1\n", "b 2\n", "c 3\n", "d 0\n", "dtype: int32" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sorting on index\n", "obj.sort_index()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dabc
three0123
one4567
\n", "
" ], "text/plain": [ " d a b c\n", "three 0 1 2 3\n", "one 4 5 6 7" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dabc
one4567
three0123
\n", "
" ], "text/plain": [ " d a b c\n", "one 4 5 6 7\n", "three 0 1 2 3" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index()" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
three1230
one5674
\n", "
" ], "text/plain": [ " a b c d\n", "three 1 2 3 0\n", "one 5 6 7 4" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
one5674
three1230
\n", "
" ], "text/plain": [ " a b c d\n", "one 5 6 7 4\n", "three 1 2 3 0" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1).sort_index()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dcba
three0321
one4765
\n", "
" ], "text/plain": [ " d c b a\n", "three 0 3 2 1\n", "one 4 7 6 5" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1, ascending=False)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 2\n", "1 NaN\n", "2 -3\n", "3 5\n", "dtype: object" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To sort a Series by its values, use its sort_values method\n", "sr = pd.Series(['2', np.nan, '-3', '5'])\n", "sr" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2 -3\n", "0 2\n", "3 5\n", "1 NaN\n", "dtype: object" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sorting by value\n", "sr.sort_values()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
004
117
20-3
312
\n", "
" ], "text/plain": [ " a b\n", "0 0 4\n", "1 1 7\n", "2 0 -3\n", "3 1 2" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
20-3
312
004
117
\n", "
" ], "text/plain": [ " a b\n", "2 0 -3\n", "3 1 2\n", "0 0 4\n", "1 1 7" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by='b')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
20-3
004
312
117
\n", "
" ], "text/plain": [ " a b\n", "2 0 -3\n", "0 0 4\n", "3 1 2\n", "1 1 7" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by=['a', 'b'])" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 7\n", "1 -5\n", "2 7\n", "3 4\n", "4 2\n", "5 0\n", "6 4\n", "dtype: int64" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# ranking: tied values share the average of their ranks by default (TODO: explore the other `method` options)\n", "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n", "obj" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 6.5\n", "1 1.0\n", "2 6.5\n", "3 4.5\n", "4 3.0\n", "5 2.0\n", "6 4.5\n", "dtype: float64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.rank()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Axis indexes with duplicate values" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "b 2\n", "b 3\n", "c 4\n", "dtype: int32" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['a', 'b', 'c'], dtype=object)" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index.unique() # get the unique index labels" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index.is_unique # check whether the index labels are unique" ] }, { "cell_type": "code", "execution_count": 80, 
"metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
a-1.0772730.7471230.948973
a1.0971370.7537230.810827
b-1.413813-0.1611470.414814
b-0.6334630.7054390.175328
\n", "
" ], "text/plain": [ " 0 1 2\n", "a -1.077273 0.747123 0.948973\n", "a 1.097137 0.753723 0.810827\n", "b -1.413813 -0.161147 0.414814\n", "b -0.633463 0.705439 0.175328" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", "df" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index.is_unique" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
a-1.0772730.7471230.948973
a1.0971370.7537230.810827
\n", "
" ], "text/plain": [ " 0 1 2\n", "a -1.077273 0.747123 0.948973\n", "a 1.097137 0.753723 0.810827" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.ix['a'] # .ix selects rows by index label here; .ix is deprecated (removed in pandas 1.0) - prefer df.loc['a']" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 -1.077273\n", "1 0.747123\n", "2 0.948973\n", "Name: a, dtype: float64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.ix[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }