{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Predictive Modeling with heterogeneous data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import warnings\n", "warnings.simplefilter('ignore', DeprecationWarning)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 0 }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Loading tabular data from the Titanic Kaggle challenge into a pandas DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us have a look at the Titanic dataset from the Kaggle Getting Started challenge at:\n", "\n", "https://www.kaggle.com/c/titanic-gettingStarted\n", "\n", "We can load the CSV file as a pandas data frame in one line:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#!curl -s https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv | head -5\n", "!head -5 titanic_train.csv" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked\r\n", "1,0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S\r\n", "2,1,1,\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",female,38,1,0,PC 17599,71.2833,C85,C\r\n", "3,1,3,\"Heikkinen, Miss. Laina\",female,26,0,0,STON/O2. 3101282,7.925,,S\r\n", "4,1,1,\"Futrelle, Mrs. 
Jacques Heath (Lily May Peel)\",female,35,1,0,113803,53.1,C123,S\r\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "#data = pd.read_csv('https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv')\n", "data = pd.read_csv('titanic_train.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "pandas data frames have an HTML table representation in the IPython notebook. Let's have a look at the first 5 rows:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data.head(5)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | PassengerId | \n", "Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "0 | \n", "3 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "22 | \n", "1 | \n", "0 | \n", "A/5 21171 | \n", "7.2500 | \n", "NaN | \n", "S | \n", "
1 | \n", "2 | \n", "1 | \n", "1 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "38 | \n", "1 | \n", "0 | \n", "PC 17599 | \n", "71.2833 | \n", "C85 | \n", "C | \n", "
2 | \n", "3 | \n", "1 | \n", "3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "26 | \n", "0 | \n", "0 | \n", "STON/O2. 3101282 | \n", "7.9250 | \n", "NaN | \n", "S | \n", "
3 | \n", "4 | \n", "1 | \n", "1 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "35 | \n", "1 | \n", "0 | \n", "113803 | \n", "53.1000 | \n", "C123 | \n", "S | \n", "
4 | \n", "5 | \n", "0 | \n", "3 | \n", "Allen, Mr. William Henry | \n", "male | \n", "35 | \n", "0 | \n", "0 | \n", "373450 | \n", "8.0500 | \n", "NaN | \n", "S | \n", "
5 rows \u00d7 12 columns
\n", "\n", " | PassengerId | \n", "Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Survived | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
0 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "424 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "68 | \n", "549 | \n", "
1 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "290 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "136 | \n", "340 | \n", "
2 rows \u00d7 12 columns
\n", "\n", " | Fare | \n", "Pclass | \n", "Age | \n", "
---|---|---|---|
0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "
1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "
2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "
3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "
4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "
5 rows \u00d7 3 columns
\n", "\n", " | Fare | \n", "Pclass | \n", "Age | \n", "
---|---|---|---|
0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "
1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "
2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "
3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "
4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "
5 rows \u00d7 3 columns
\n", "\n", " | Sex_female | \n", "Sex_male | \n", "
---|---|---|
0 | \n", "0 | \n", "1 | \n", "
1 | \n", "1 | \n", "0 | \n", "
2 | \n", "1 | \n", "0 | \n", "
3 | \n", "1 | \n", "0 | \n", "
4 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 2 columns
\n", "\n", " | Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
---|---|---|---|
0 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 3 columns
\n", "\n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Sex_male | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 8 columns
\n", "\n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 7 columns
\n", "\n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 7 columns
\n", "\n", " | Fare | \n", "Age | \n", "Sex_female | \n", "Pclass_1 | \n", "Pclass_2 | \n", "Pclass_3 | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "7.2500 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "71.2833 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
2 | \n", "7.9250 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "53.1000 | \n", "35 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
4 | \n", "8.0500 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 9 columns
\n", "