{ "metadata": { "name": "structure_of_a_data_analysis" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Structure of a data analysis -- in Python" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd\n", "import pandas.rpy.common as com" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "# load the data\n", "spam = com.load_data('spam', package='kernlab')\n", "\n", "# get the number of rows and columns in the data\n", "spam.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 5, "text": [ "(4601, 58)" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we'll use the rpy2 interface (through the rmagic extension) to generate the same trainIndicator variable as in the course. The course draws the random sample in R after calling set.seed(3435), and NumPy's random number generator would not reproduce those draws from the same seed." ] }, { "cell_type": "code", "collapsed": false, "input": [ "%load_ext rmagic" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "%%R -o trainIndicator\n", "set.seed(3435)\n", "trainIndicator = rbinom(4601,size=1,prob=0.5)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now the trainIndicator variable is in the Python namespace, stored as a NumPy array." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# this is equivalent to the R table command\n", "pd.Series(trainIndicator).value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 8, "text": [ "0 2314\n", "1 2287" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "## another way to do it completely in Python:\n", "# np.random.seed(3435)\n", "# trainIndicator = 
np.random.binomial(1, 0.5, 4601)\n", "# np.bincount(trainIndicator)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# split into training and test data\n", "trainSpam = spam[trainIndicator == 1]\n", "testSpam = spam[trainIndicator == 0]\n", "trainSpam.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 10, "text": [ "(2287, 58)" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "# show column names\n", "print trainSpam.columns " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Index([make, address, all, num3d, our, over, remove, internet, order, mail, receive, will, people, report, addresses, free, business, email, you, credit, your, font, num000, money, hp, hpl, george, num650, lab, labs, telnet, num857, data, num415, num85, technology, num1999, parts, pm, direct, cs, meeting, original, project, re, edu, table, conference, charSemicolon, charRoundbracket, charSquarebracket, charExclamation, charDollar, charHash, capitalAve, capitalLong, capitalTotal, type], dtype=object)\n" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "# this is the equivalent of R head command\n", "trainSpam.ix[:,:20].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | make | \n", "address | \n", "all | \n", "num3d | \n", "our | \n", "over | \n", "remove | \n", "internet | \n", "order | \n", "receive | \n", "will | \n", "people | \n", "report | \n", "addresses | \n", "free | \n", "business | \n", "you | \n", "credit | \n", "||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "0.00 | \n", "0.64 | \n", "0.64 | \n", "0 | \n", "0.32 | \n", "0.00 | \n", "0.00 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.64 | \n", "0.00 | \n", "0 | \n", "0 | \n", "0.32 | \n", "0 | \n", "1.29 | \n", "1.93 | \n", "0.00 | \n", "
7 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0 | \n", "1.92 | \n", "0.00 | \n", "0.00 | \n", "0 | \n", "0.00 | \n", "0.64 | \n", "0.96 | \n", "1.28 | \n", "0.00 | \n", "0 | \n", "0 | \n", "0.96 | \n", "0 | \n", "0.32 | \n", "3.85 | \n", "0.00 | \n", "
9 | \n", "0.15 | \n", "0.00 | \n", "0.46 | \n", "0 | \n", "0.61 | \n", "0.00 | \n", "0.30 | \n", "0 | \n", "0.92 | \n", "0.76 | \n", "0.76 | \n", "0.92 | \n", "0.00 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0 | \n", "0.15 | \n", "1.23 | \n", "3.53 | \n", "
12 | \n", "0.00 | \n", "0.00 | \n", "0.25 | \n", "0 | \n", "0.38 | \n", "0.25 | \n", "0.25 | \n", "0 | \n", "0.00 | \n", "0.00 | \n", "0.12 | \n", "0.12 | \n", "0.12 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0 | \n", "0.00 | \n", "1.16 | \n", "0.00 | \n", "
14 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0 | \n", "0.90 | \n", "0.00 | \n", "0.90 | \n", "0 | \n", "0.00 | \n", "0.90 | \n", "0.90 | \n", "0.00 | \n", "0.90 | \n", "0 | \n", "0 | \n", "0.00 | \n", "0 | \n", "0.00 | \n", "2.72 | \n", "0.00 | \n", "