{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Predictive Modeling with heterogeneous data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import warnings\n",
"warnings.simplefilter('ignore', DeprecationWarning)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 0
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Loading tabular data from the Titanic Kaggle challenge into a pandas DataFrame"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us have a look at the Titanic dataset from the Kaggle Getting Started challenge at:\n",
"\n",
"https://www.kaggle.com/c/titanic-gettingStarted\n",
"\n",
"Let's first peek at the first few lines of the raw CSV file, and then load it as a pandas data frame in one line:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#!curl -s https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv | head -5\n",
"!head -5 titanic_train.csv"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked\r\n",
"1,0,3,\"Braund, Mr. Owen Harris\",male,22,1,0,A/5 21171,7.25,,S\r\n",
"2,1,1,\"Cumings, Mrs. John Bradley (Florence Briggs Thayer)\",female,38,1,0,PC 17599,71.2833,C85,C\r\n",
"3,1,3,\"Heikkinen, Miss. Laina\",female,26,0,0,STON/O2. 3101282,7.925,,S\r\n",
"4,1,1,\"Futrelle, Mrs. Jacques Heath (Lily May Peel)\",female,35,1,0,113803,53.1,C123,S\r\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#data = pd.read_csv('https://dl.dropboxusercontent.com/u/5743203/data/titanic/titanic_train.csv')\n",
"data = pd.read_csv('titanic_train.csv')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"pandas data frames have an HTML table representation in the IPython notebook. Let's have a look at the first 5 rows:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"data.head(5)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
| \n", " | PassengerId | \n", "Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "1 | \n", "0 | \n", "3 | \n", "Braund, Mr. Owen Harris | \n", "male | \n", "22 | \n", "1 | \n", "0 | \n", "A/5 21171 | \n", "7.2500 | \n", "NaN | \n", "S | \n", "
| 1 | \n", "2 | \n", "1 | \n", "1 | \n", "Cumings, Mrs. John Bradley (Florence Briggs Th... | \n", "female | \n", "38 | \n", "1 | \n", "0 | \n", "PC 17599 | \n", "71.2833 | \n", "C85 | \n", "C | \n", "
| 2 | \n", "3 | \n", "1 | \n", "3 | \n", "Heikkinen, Miss. Laina | \n", "female | \n", "26 | \n", "0 | \n", "0 | \n", "STON/O2. 3101282 | \n", "7.9250 | \n", "NaN | \n", "S | \n", "
| 3 | \n", "4 | \n", "1 | \n", "1 | \n", "Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n", "female | \n", "35 | \n", "1 | \n", "0 | \n", "113803 | \n", "53.1000 | \n", "C123 | \n", "S | \n", "
| 4 | \n", "5 | \n", "0 | \n", "3 | \n", "Allen, Mr. William Henry | \n", "male | \n", "35 | \n", "0 | \n", "0 | \n", "373450 | \n", "8.0500 | \n", "NaN | \n", "S | \n", "
5 rows \u00d7 12 columns
\n", "| \n", " | PassengerId | \n", "Survived | \n", "Pclass | \n", "Name | \n", "Sex | \n", "Age | \n", "SibSp | \n", "Parch | \n", "Ticket | \n", "Fare | \n", "Cabin | \n", "Embarked | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Survived | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
| 0 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "424 | \n", "549 | \n", "549 | \n", "549 | \n", "549 | \n", "68 | \n", "549 | \n", "
| 1 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "290 | \n", "342 | \n", "342 | \n", "342 | \n", "342 | \n", "136 | \n", "340 | \n", "
2 rows \u00d7 12 columns
\n", "| \n", " | Fare | \n", "Pclass | \n", "Age | \n", "
|---|---|---|---|
| 0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "
| 1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "
| 2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "
| 3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "
| 4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "
5 rows \u00d7 3 columns
\n", "| \n", " | Fare | \n", "Pclass | \n", "Age | \n", "
|---|---|---|---|
| 0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "
| 1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "
| 2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "
| 3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "
| 4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "
5 rows \u00d7 3 columns
\n", "| \n", " | Sex_female | \n", "Sex_male | \n", "
|---|---|---|
| 0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "1 | \n", "0 | \n", "
| 2 | \n", "1 | \n", "0 | \n", "
| 3 | \n", "1 | \n", "0 | \n", "
| 4 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 2 columns
\n", "| \n", " | Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
|---|---|---|---|
| 0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "1 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 3 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 4 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 3 columns
\n", "| \n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Sex_male | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 8 columns
\n", "| \n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 7 columns
\n", "| \n", " | Fare | \n", "Pclass | \n", "Age | \n", "Sex_female | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
|---|---|---|---|---|---|---|---|
| 0 | \n", "7.2500 | \n", "3 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "71.2833 | \n", "1 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "7.9250 | \n", "3 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 3 | \n", "53.1000 | \n", "1 | \n", "35 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 4 | \n", "8.0500 | \n", "3 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 7 columns
\n", "| \n", " | Fare | \n", "Age | \n", "Sex_female | \n", "Pclass_1 | \n", "Pclass_2 | \n", "Pclass_3 | \n", "Embarked_C | \n", "Embarked_Q | \n", "Embarked_S | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "7.2500 | \n", "22 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 1 | \n", "71.2833 | \n", "38 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "
| 2 | \n", "7.9250 | \n", "26 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 3 | \n", "53.1000 | \n", "35 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
| 4 | \n", "8.0500 | \n", "35 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
5 rows \u00d7 9 columns
\n", "