{
 "metadata": {
  "name": "representing_data"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Representing Data in R -- Python equivalent"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "import pandas as pd\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'characters' is equivalent to string\n",
      "firstName = 'jeff'\n",
      "print type(firstName), firstName"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'str'> jeff\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'numeric' is equivalent to float\n",
      "heightCM = 188.2\n",
      "print type(heightCM), heightCM"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'float'> 188.2\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# integer is equivalent to integer\n",
      "numberSons = 1\n",
      "print type(numberSons), numberSons"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'int'> 1\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'logical' is equivalent to Boolean\n",
      "teachingCoursera = True\n",
      "print type(teachingCoursera), teachingCoursera"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "<type 'bool'> True\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'vectors' is equivalent to numpy array or Python list (I will use array everywhere for consistency)\n",
      "heights = np.array([188.2, 181.3, 193.4])\n",
      "print heights\n",
      "\n",
      "firstNames = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
      "print firstNames"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[ 188.2  181.3  193.4]\n",
        "['jeff' 'roger' 'andrew' 'brian']\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'list' is equivalent to dictionary in Python\n",
      "vector1 = np.array([188.2, 181.3, 193.4])\n",
      "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
      "myList = dict(heights = vector1, firstNames = vector2)\n",
      "print myList\n",
      "\n",
      "print myList['heights']\n",
      "print myList['firstNames']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], \n",
        "      dtype='|S6'), 'heights': array([ 188.2,  181.3,  193.4])}\n",
        "[ 188.2  181.3  193.4]\n",
        "['jeff' 'roger' 'andrew' 'brian']\n"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# 'matrices' is equivalent to two-dimensional numpy array\n",
      "myMatrix = np.array([[1, 2], [3, 4]])\n",
      "print myMatrix"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[1 2]\n",
        " [3 4]]\n"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# data frame is equivalent to Pandas DataFrame\n",
      "# this example doesn't work because the input array lengths are not the same\n",
      "vector1 = np.array([188.2, 181.3, 193.4])\n",
      "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
      "\n",
      "# ValueError: arrays must all be same length\n",
      "# \n",
      "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "ename": "ValueError",
       "evalue": "arrays must all be same length",
       "output_type": "pyerr",
       "traceback": [
        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
        "\u001b[0;32m<ipython-input-10-58e1535d1fac>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;31m# ValueError: arrays must all be same length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mmyDataFrame\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheights\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvector1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfirstNames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvector2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
        "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m    383\u001b[0m             \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    384\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m             \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    386\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    387\u001b[0m             \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetmaskarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_init_dict\u001b[0;34m(self, data, index, columns, dtype)\u001b[0m\n\u001b[1;32m    515\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    516\u001b[0m         return _arrays_to_mgr(arrays, data_names, index, columns,\n\u001b[0;32m--> 517\u001b[0;31m                               dtype=dtype)\n\u001b[0m\u001b[1;32m    518\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    519\u001b[0m     def _init_ndarray(self, values, index, columns, dtype=None,\n",
        "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36m_arrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype)\u001b[0m\n\u001b[1;32m   5343\u001b[0m     \u001b[0;31m# figure out the index, if necessary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5344\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5345\u001b[0;31m         \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   5346\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5347\u001b[0m         \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;32m/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc\u001b[0m in \u001b[0;36mextract_index\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m   5395\u001b[0m             \u001b[0mlengths\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_lengths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5396\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlengths\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5397\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'arrays must all be same length'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   5398\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5399\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mhave_dicts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
        "\u001b[0;31mValueError\u001b[0m: arrays must all be same length"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# data frame -- fixed\n",
      "vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n",
      "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
      "\n",
      "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n",
      "myDataFrame"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>firstNames</th>\n",
        "      <th>heights</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>   jeff</td>\n",
        "      <td> 188.2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>  roger</td>\n",
        "      <td> 181.3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> andrew</td>\n",
        "      <td> 193.4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>  brian</td>\n",
        "      <td> 192.3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "  firstNames  heights\n",
        "0       jeff    188.2\n",
        "1      roger    181.3\n",
        "2     andrew    193.4\n",
        "3      brian    192.3"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# factors is equivalent to pandas Categorical\n",
      "smoker = np.array(['yes', 'no', 'yes', 'yes'])\n",
      "smokerFactor = pd.Categorical.from_array(smoker)\n",
      "smokerFactor"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "Categorical: \n",
        "array(['yes', 'no', 'yes', 'yes'], dtype=object)\n",
        "Levels (2): Index(['no', 'yes'], dtype=object)"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# R's NA missing values is equivalent to NaN\n",
      "vector1 = np.array([188.2, 181.3, 193.4, NaN])\n",
      "print vector1\n",
      "print isnan(vector1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[ 188.2  181.3  193.4    nan]\n",
        "[False False False  True]\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# subsetting\n",
      "vector1 = np.array([188.2, 181.3, 193.4, 192.3])\n",
      "vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])\n",
      "\n",
      "myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))\n",
      "\n",
      "print '------------------'\n",
      "print vector1[0]\n",
      "print '------------------'\n",
      "print vector1[[0, 1, 3]]\n",
      "print '------------------'\n",
      "print myDataFrame.ix[0, 0:2] # appears transposed as compared to R\n",
      "print '------------------'\n",
      "print myDataFrame['firstNames'] # there's no 'Levels' as in R\n",
      "print '------------------'\n",
      "print myDataFrame[myDataFrame['firstNames'] == 'jeff']\n",
      "print '------------------'\n",
      "print myDataFrame[myDataFrame['heights'] < 190]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "------------------\n",
        "188.2\n",
        "------------------\n",
        "[ 188.2  181.3  192.3]\n",
        "------------------\n",
        "firstNames     jeff\n",
        "heights       188.2\n",
        "Name: 0\n",
        "------------------\n",
        "0      jeff\n",
        "1     roger\n",
        "2    andrew\n",
        "3     brian\n",
        "Name: firstNames\n",
        "------------------\n",
        "  firstNames  heights\n",
        "0       jeff    188.2\n",
        "------------------\n",
        "  firstNames  heights\n",
        "0       jeff    188.2\n",
        "1      roger    181.3\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}