{ "metadata": { "name": "representing_data" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Representing Data in R -- Python equivalent" ] }, { "cell_type": "code", "collapsed": true, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'characters' is equivalent to string\n", "firstName = 'jeff'\n", "print type(firstName), firstName" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " jeff\n" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'numeric' is equivalent to float\n", "heightCM = 188.2\n", "print type(heightCM), heightCM" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 188.2\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# integer is equivalent to integer\n", "numberSons = 1\n", "print type(numberSons), numberSons" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 1\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'logical' is equivalent to Boolean\n", "teachingCoursera = True\n", "print type(teachingCoursera), teachingCoursera" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " True\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'vectors' is equivalent to numpy array or Python list (I will use array everywhere for consistency)\n", "heights = np.array([188.2, 181.3, 193.4])\n", "print heights\n", "\n", "firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", "print firstNames" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ 188.2 181.3 193.4]\n", "['jeff' 'roger' 'andrew' 'brian']\n" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'list' is equivalent to dictionary in Python\n", "vector1 = np.array([188.2, 181.3, 193.4])\n", "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", "myList = dict(heights = vector1, firstNames = vector2)\n", "print myList\n", "\n", "print myList['heights']\n", "print myList['firstNames']" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], \n", " dtype='|S6'), 'heights': array([ 188.2, 181.3, 193.4])}\n", "[ 188.2 181.3 193.4]\n", "['jeff' 'roger' 'andrew' 'brian']\n" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# 'matrices' is equivalent to two-dimensional numpy array\n", "myMatrix = np.array([[1, 2], [3, 4]])\n", "print myMatrix" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[[1 2]\n", " [3 4]]\n" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# data frame is equivalent to Pandas DataFrame\n", "# this example doesn't work because the input array lengths are not the same\n", "vector1 = np.array([188.2, 181.3, 193.4])\n", "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", "\n", "# ValueError: arrays must all be same length\n", "# \n", "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "arrays must all be same length", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;31m# ValueError: arrays must all be same length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mmyDataFrame\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheights\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvector1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfirstNames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvector2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 386\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetmaskarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_init_dict\u001b[0;34m(self, data, index, columns, dtype)\u001b[0m\n\u001b[1;32m 515\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 516\u001b[0m return _arrays_to_mgr(arrays, data_names, index, columns,\n\u001b[0;32m--> 517\u001b[0;31m dtype=dtype)\n\u001b[0m\u001b[1;32m 518\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 519\u001b[0m def _init_ndarray(self, values, index, columns, dtype=None,\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_arrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype)\u001b[0m\n\u001b[1;32m 5343\u001b[0m \u001b[0;31m# figure out the index, if necessary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5344\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5345\u001b[0;31m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5346\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5347\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mextract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 5395\u001b[0m \u001b[0mlengths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_lengths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5396\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlengths\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5397\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'arrays must all be same length'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5398\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5399\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhave_dicts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: arrays must all be same length" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "# data frame -- fixed\n", "vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n", "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", "\n", "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n", "myDataFrame" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstNamesheights
0 jeff 188.2
1 roger 181.3
2 andrew 193.4
3 brian 192.3
\n", "
" ], "output_type": "pyout", "prompt_number": 11, "text": [ " firstNames heights\n", "0 jeff 188.2\n", "1 roger 181.3\n", "2 andrew 193.4\n", "3 brian 192.3" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# factors is equivalent to pandas Categorical\n", "smoker = np.array(['yes', 'no', 'yes', 'yes'])\n", "smokerFactor = pd.Categorical.from_array(smoker)\n", "smokerFactor" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 12, "text": [ "Categorical: \n", "array(['yes', 'no', 'yes', 'yes'], dtype=object)\n", "Levels (2): Index(['no', 'yes'], dtype=object)" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# R's NA missing values is equivalent to NaN\n", "vector1 = np.array([188.2, 181.3, 193.4, NaN])\n", "print vector1\n", "print isnan(vector1)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ 188.2 181.3 193.4 nan]\n", "[False False False True]\n" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "# subsetting\n", "vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n", "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n", "\n", "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n", "\n", "print '------------------'\n", "print vector1[0]\n", "print '------------------'\n", "print vector1[[0, 1, 3]]\n", "print '------------------'\n", "print myDataFrame.ix[0, 0:2] # appears transposed as compared to R\n", "print '------------------'\n", "print myDataFrame['firstNames'] # there's no 'Levels' as in R\n", "print '------------------'\n", "print myDataFrame[myDataFrame['firstNames'] == 'jeff']\n", "print '------------------'\n", "print myDataFrame[myDataFrame['heights'] < 190]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "------------------\n", "188.2\n", "------------------\n", "[ 188.2 181.3 192.3]\n", "------------------\n", "firstNames jeff\n", "heights 188.2\n", "Name: 0\n", "------------------\n", "0 jeff\n", "1 roger\n", "2 andrew\n", "3 brian\n", "Name: firstNames\n", "------------------\n", " firstNames heights\n", "0 jeff 188.2\n", "------------------\n", " firstNames heights\n", "0 jeff 188.2\n", "1 roger 181.3\n" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": true, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }