{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Titanic Dataset: Basic Data Exploration\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "#Import Libraries\n", "from matplotlib import pyplot as plt\n", "import sklearn\n", "import pandas as pd\n", "import numpy as np\n", "from scipy.stats import pearsonr" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [
 "# Load the Titanic train/test CSV files. The paths are relative to the\n",
 "# directory the notebook is run from.\n",
 "train_dataset = pd.read_csv(\"dataset/train.csv\")\n",
 "test_dataset = pd.read_csv(\"dataset/test.csv\")"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harris122.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...038.010PC 1759971.2833C85C
2313Heikkinen, Miss. Laina026.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)035.01011380353.1000C123S
4503Allen, Mr. William Henry135.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 1 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 0 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 \n", "4 Allen, Mr. William Henry 1 35.0 0 0 \n", "\n", " Ticket Fare Cabin Embarked \n", "0 A/5 21171 7.2500 NaN S \n", "1 PC 17599 71.2833 C85 C \n", "2 STON/O2. 3101282 7.9250 NaN S \n", "3 113803 53.1000 C123 S \n", "4 373450 8.0500 NaN S " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Change Male/Female to numeric classes (female -> 0, male -> 1).\n",
 "# The result is assigned back to the column: replace(..., inplace=True) on\n",
 "# train_dataset['Sex'] mutates an intermediate Series, and pandas with\n",
 "# copy-on-write no longer propagates that change to train_dataset.\n",
 "# The astype call guarantees a numeric column for pearsonr later on and\n",
 "# raises if a value other than female/male (or an already encoded 0/1) shows up,\n",
 "# so the cell can be re-run safely.\n",
 "train_dataset['Sex'] = train_dataset['Sex'].replace({'female': 0, 'male': 1}).astype('int64')\n",
 "train_dataset.head()"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvived...ParchFare
countmeanstdmin25%50%75%maxcountmean...75%maxcountmeanstdmin25%50%75%max
0549.0447.016393260.6404691.0211.00455.0675.0891.0549.00.0...0.06.0549.022.11788731.3882070.07.854210.526.0263.0000
1342.0444.368421252.3588402.0250.75439.5651.5890.0342.01.0...1.05.0342.048.39540866.5969980.012.475026.057.0512.3292
\n", "

2 rows × 64 columns

\n", "
" ], "text/plain": [ " PassengerId \\\n", " count mean std min 25% 50% 75% max \n", "0 549.0 447.016393 260.640469 1.0 211.00 455.0 675.0 891.0 \n", "1 342.0 444.368421 252.358840 2.0 250.75 439.5 651.5 890.0 \n", "\n", " Survived ... Parch Fare \\\n", " count mean ... 75% max count mean std min \n", "0 549.0 0.0 ... 0.0 6.0 549.0 22.117887 31.388207 0.0 \n", "1 342.0 1.0 ... 1.0 5.0 342.0 48.395408 66.596998 0.0 \n", "\n", " \n", " 25% 50% 75% max \n", "0 7.8542 10.5 26.0 263.0000 \n", "1 12.4750 26.0 57.0 512.3292 \n", "\n", "[2 rows x 64 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dataset.groupby('Survived',as_index=False).describe()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Exploratory Data Analysis " ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### How does Gender affect chances of Survival?" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pearson Correlation Test = -0.5433513806577553\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Count passengers for every (Survived, Sex) pair. Sex was encoded earlier in\n",
 "# the notebook as 0 = female, 1 = male.\n",
 "survived = train_dataset['Survived']\n",
 "sex = train_dataset['Sex']\n",
 "outcomes = [0, 1]\n",
 "male = [int(((survived == s) & (sex == 1)).sum()) for s in outcomes]\n",
 "female = [int(((survived == s) & (sex == 0)).sum()) for s in outcomes]\n",
 "\n",
 "ind = np.arange(len(outcomes))\n",
 "width = 0.3  # the two bars of a group sit half a width either side of the tick\n",
 "fig, ax = plt.subplots(figsize=(8, 5))\n",
 "ax.bar(ind - width/2, male, width, color='SkyBlue', label='Men')\n",
 "ax.bar(ind + width/2, female, width, color='IndianRed', label='Women')\n",
 "\n",
 "ax.set_ylabel('Number of people')\n",
 "ax.set_xlabel('Survived')\n",
 "ax.set_title('Survivors by gender')\n",
 "ax.set_xticks(ind)\n",
 "ax.set_xticklabels([str(s) for s in outcomes])\n",
 "ax.legend()\n",
 "\n",
 "print(\"Pearson Correlation Test = {}\".format(pearsonr(train_dataset[\"Sex\"], train_dataset[\"Survived\"])[0]))"
 ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### How does Class affect chances of Survival?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": false }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Pearson Correlation Test = -0.33848103596101536\n" ] } ], "source": [
 "uniq_pclass = [1, 2, 3]\n",
 "pclass = train_dataset['Pclass']\n",
 "is_survivor = train_dataset['Survived'] == 1\n",
 "\n",
 "# Per class: how many passengers survived, and how many there were in total.\n",
 "Pclass_survivors = [int((is_survivor & (pclass == c)).sum()) for c in uniq_pclass]\n",
 "total_passengers = [int((pclass == c).sum()) for c in uniq_pclass]\n",
 "ind = np.arange(len(uniq_pclass))\n",
 "width = 0.35\n",
 "\n",
 "fig, ax = plt.subplots(figsize=(8, 5))\n",
 "# The survivor bars are drawn on top of the per-class totals, so the red that\n",
 "# stays visible above each blue bar is the number of passengers who died.\n",
 "ax.bar(ind, total_passengers, width, color='IndianRed', label=\"dead folks\")\n",
 "ax.bar(ind, Pclass_survivors, width, color='SkyBlue', label=\"survivors\")\n",
 "\n",
 "# The axis counts all passengers (dead and alive), not only survivors.\n",
 "ax.set_ylabel('Number of people')\n",
 "ax.set_xlabel('Pclass')\n",
 "ax.set_title('Amount of Survivors per class')\n",
 "ax.set_xticks(ind)\n",
 "ax.set_xticklabels(uniq_pclass)\n",
 "ax.legend()\n",
 "plt.show()\n",
 "\n",
 "print(\"Pearson Correlation Test = {}\".format(pearsonr(train_dataset[\"Pclass\"], train_dataset[\"Survived\"])[0]))"
 ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### The effect of Age on Survival?" 
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pearson Correlation Test = -0.25408475420305304\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfgAAAFACAYAAABQsW5nAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHUBJREFUeJzt3XuUXWWZ5/Hvk4srXMWEhIUErKDcbIEYKggNooACKob7SoDRDIPSLmEagtoSRkXGRqRbRYFpIC1KmLYTMJgEaVqJGJRm7EguCIEYoDFACSZFlHCRS0Ke+ePsQBEqlZOqOrdd389atc7Z++xz9vOuOpVf3n1538hMJElSuQxqdAGSJKn/GfCSJJWQAS9JUgkZ8JIklZABL0lSCRnwkiSVkAEvSVIJGfCSJJWQAS9JUgkNaXQBfbHjjjtmW1tbo8uQJKluFi1a9HRmjtzcdi0d8G1tbSxcuLDRZUiSVDcR8Vg123mIXpKkEjLgJUkqIQNekqQSaulz8JKk8lu7di0dHR289NJLjS6lroYNG8bo0aMZOnRor95vwEuSmlpHRwfbbbcdbW1tRESjy6mLzGT16tV0dHQwZsyYXn2Gh+glSU3tpZdeYsSIEQMm3AEighEjRvTpqIUBL0lqegMp3Dfoa5sNeEmSSshz8JKklnL5vIf69fOmfHjPqrabPXs2J554IsuWLWPvvfdmxYoVHHvssSxdupSFCxdyww03cMUVV/RrbX1hD16SpCrMmDGDQw89lJkzZ77ptfb29qYKdzDgJUnarOeff567776b6667rtuAv/POOzn22GNZv349bW1tPPPMM6+99q53vYuVK1fS2dnJSSedxPjx4xk/fjx33313TWv2EL1UrfmX1m9fh0+t374kbdacOXM45phj2HPPPRk+fDiLFy9m+PDhb9pu0KBBHHfcccyePZszzjiDBQsW0NbWxk477cRpp53GlClTOPTQQ3n88cc5+uijWbZsWc1qtgcvSdJmzJgxg0mTJgEwadIkZsyYscltJ06cyI033gjAzJkzmThxIgA///nPOeeccxg7diwTJkzg2Wef5bnnnqtZzfbgJUnqwerVq/nFL37B0qVLiQheffVVIoLPfvaz3W5/8MEH88gjj9DZ2cmcOXP40pe+BMD69ev59a9/zVZbbVWXuu3BS5LUg1mzZvHJT36Sxx57jBUrVvDEE08wZswYOjo6ut0+IjjhhBM4//zz2WeffRgxYgQARx11FFddddVr29177701rdsevCSppVR7W1t/mTFjBhdccMEb1p100kl8/etf3+R7Jk6cyPjx47n++utfW3fFFVdw9tlns99++7Fu3ToOO+wwrrnmmlqVTWRmzT681trb23PhwoWNLkMDhRfZSQ2xbNky9tlnn0aX0RDdtT0iFmVm++be6yF6SZJKyICXJKmEDHhJkkrIgJckqYQMeEmSSqhmAR8R34+IVRGxtMu6f4yI30XEfRExOyJ26PLa1Ih4JCKWR8TRtapLkqSBoJb3wV8PXAXc0GXdPGBqZq6LiMuAqcAXI+LdwCTgr4C3Az+PiD0z89Ua1idJakX9fctqFbelDh48mH333Ze1a9cyZMgQJk+ezHnnncegQX3vJ3/1q19l22235fOf/3yfP6urmgV8Zv4qIto2Wnd7l8X/BE4unh8HzMzMl4HfR8QjwIH
Ar2tVnyRJ1dpqq61eG3lu1apVnHbaaaxZs4aLL764wZVtWiPPwf8P4N+L57sAT3R5raNY9yYRcVZELIyIhZ2dnTUuUZKkNxo1ahTTpk3jqquuIjN59dVX+cIXvsD48ePZb7/9uPbaa4HKFLNHHnkk48aNY99992Xu3LmvfcYll1zCXnvtxYc+9CGWL19ekzobMlRtRPwvYB3www2rutms2yH2MnMaMA0qI9nVpEBJknqw++67s379elatWsXcuXN561vfyj333MPLL7/MIYccwlFHHcWuu+7K7Nmz2X777Xn66ac56KCDmDBhAosXL2bmzJksWbKEdevWMW7cOA444IB+r7HuAR8Rk4FjgSPz9XFyO4Bdu2w2Gniy3rVJklStDRF2++23c9999zFr1iwA1qxZw8MPP8zo0aO58MIL+dWvfsWgQYP4wx/+wMqVK7nrrrs44YQT2HrrrQGYMGFCTeqra8BHxDHAF4EPZOZfurx0C/CvEfFtKhfZ7QH8pp61SZJUrUcffZTBgwczatQoMpMrr7ySo49+4w1g119/PZ2dnSxatIihQ4fS1tbGSy+9BFRmnKu1Wt4mN4PKRXJ7RURHRJxJ5ar67YB5EXFvRFwDkJkPADcBDwI/Bc72CnpJUjPq7OzkM5/5DOeccw4RwdFHH83VV1/N2rVrAXjooYd44YUXWLNmDaNGjWLo0KHMnz+fxx57DIDDDjuM2bNn8+KLL/Lcc8/xk5/8pCZ11vIq+lO7WX1dD9tfAlxSq3okSSXRgNkWX3zxRcaOHfvabXKf+MQnOP/88wH41Kc+xYoVKxg3bhyZyciRI5kzZw6nn346H//4x2lvb2fs2LHsvffeAIwbN46JEycyduxY3vGOd/D+97+/JjU7XaxULaeLlRrC6WKdLlaSJBUMeEmSSsiAlyQ1vVY+ndxbfW1zQwa6kfpNPc+LS2qIYcOGsXr1akaMGFGX28uaQWayevVqhg0b1uvPMOAlSU1t9OjRdHR0MNCGJx82bBijR4/u9fsNeElSUxs6dChjxoxpdBktx3PwkiSVkAEvSVIJGfCSJJWQAS9JUgkZ8JIklZABL0lSCRnwkiSVkAEvSVIJGfCSJJWQAS9JUgkZ8JIklZABL0lSCRnwkiSVkAEvSVIJGfCSJJWQAS9JUgkZ8JIklZABL0lSCRnwkiSVkAEvSVIJDWl0ASqh+Zc2ugJJGvDswUuSVEIGvCRJJWTAS5JUQga8JEklVLOAj4jvR8SqiFjaZd3wiJgXEQ8Xj28r1kdEXBERj0TEfRExrlZ1SZI0ENSyB389cMxG6y4A7sjMPYA7imWAjwB7FD9nAVfXsC5JkkqvZgGfmb8C/rTR6uOA6cXz6cDxXdbfkBX/CewQETvXqjZJksqu3ufgd8rMpwCKx1HF+l2AJ7ps11Gse5OIOCsiFkbEws7OzpoWK0lSq2qWi+yim3XZ3YaZOS0z2zOzfeTIkTUuS5Kk1lTvgF+54dB78biqWN8B7Nplu9HAk3WuTZKk0qh3wN8CTC6eTwbmdln/yeJq+oOANRsO5UuSpC1Xs7HoI2IG8EFgx4joAC4CvgHcFBFnAo8DpxSb3wZ8FHgE+AtwRq3qkiRpIKhZwGfmqZt46chutk3g7FrVIknSQNMsF9lJkqR+ZMBLklRCBrwkSSVkwEuSVEIGvCRJJWTAS5JUQga8JEklZMBLklRCBrwkSSVkwEuSVEIGvCRJJWTAS5JUQga8JEklZMBLklRCBrwkSSVkwEuSVEIGvCRJJWTAS5JUQga8JEkltNmAj4itI+LLEfHPxfIeEXFs7UuTJEm9VU0P/gfAy8DBxXIH8Pc1q0iSJPVZNQH/zsz8B2AtQGa+CERNq5IkSX1STcC/EhFbAQkQEe+k0qOXJElNakgV21wE/BTYNSJ+CBwC/PdaFiVJkvpmswGfmfMiYjFwEJVD8+dm5tM1r0ySJPXaJgM+IsZttOqp4nG3iNgtMxfXrixJktQXPfXgv9X
Dawkc0c+1SJKkfrLJgM/Mw+tZiCRJ6j+bPQcfEcOAzwKHUum53wVck5kv1bg2SZLUS9VcRX8D8BxwZbF8KvB/gVNqVZQkSeqbagJ+r8zcv8vy/Ij4ba0KkiRJfVfNQDdLIuKgDQsR8T7g7r7sNCKmRMQDEbE0ImZExLCIGBMRCyLi4Yi4MSLe0pd9SJI0kFUT8O8D/l9ErIiIFcCvgQ9ExP0Rcd+W7jAidgH+FmjPzPcAg4FJwGXA5Zm5B/Bn4Mwt/WxJklRRzSH6Y2q0360iYi2wNZV77I8ATitenw58Fbi6BvuWJKn0qhnJ7rGI2B94f7Hqrszs9Tn4zPxDRHwTeBx4EbgdWAQ8k5nris06gF16uw9Jkga6am6TOxf4NPDjYtW/RMS0zLyyh7f19HlvA44DxgDPAD8CPtLNprmJ958FnAWw22679aYEqfnNv7R++zp8av32JaluqjlEfybwvsx8ASAiLqNyHr5XAQ98CPh9ZnYWn/dj4K+BHSJiSNGLHw082d2bM3MaMA2gvb292/8ESJI00FVzkV0Ar3ZZfpW+zQf/OHBQRGwdEQEcCTwIzAdOLraZDMztwz4kSRrQqunB/wBYEBGzi+Xjget6u8PMXBARs4DFwDpgCZUe+b8BMyPi74t1vd6HJEkDXTUX2X07Iu6kMlRtAGdk5pK+7DQzL6Iyz3xXjwIH9uVzJUlSRTWH6KFyK9tzmfldoCMixtSwJkmS1EebDfiIuAj4IrDhUtuhwL/UsihJktQ31fTgTwAmAC8AZOaTwHa1LEqSJPVNNQH/SmYmxX3pEbFNbUuSJEl9VU3A3xQR11K5T/3TwM+Bf65tWZIkqS+quYr+mxHxYeBZYC/gK5k5r+aVSZKkXusx4CPieOBdwP2Z+YX6lCRJkvpqk4foI+KfgCnACOBrEfHlulUlSZL6pKce/GHA/pn5akRsDdwFfK0+ZUmSpL7o6SK7VzLzVYDM/At9G39ekiTVUU89+L0j4r7ieQDvLJYDyMzcr+bVSZKkXukp4PepWxWSJKlfbTLgM/OxehYiSZL6T7WTzUiSpBZiwEuSVEI93Qd/R/F4Wf3KkSRJ/aGni+x2jogPABMiYiYb3SaXmYtrWpkkSeq1ngL+K8AFwGjg2xu9lsARtSpKkiT1TU9X0c8CZkXElzPTEewkSWoh1cwm97WImEBl6FqAOzPz1tqWJUmS+mKzV9FHxKXAucCDxc+5xTpJktSkNtuDBz4GjM3M9QARMR1YAkytZWGSJKn3qr0Pfocuz99ai0IkSVL/qaYHfymwJCLmU7lV7jDsvUuS1NSquchuRkTcCYynEvBfzMw/1rowSZLUe9X04MnMp4BbalyLJEnqJ45FL0lSCRnwkiSVUI8BHxGDImJpvYqRJEn9o8eAL+59/21E7FaneiRJUj+o5iK7nYEHIuI3wAsbVmbmhJpVJUmS+qSagL+45lVIkqR+tdmL7DLzl8AKYGjx/B6gT3PBR8QOETErIn4XEcsi4uCIGB4R8yLi4eLxbX3ZhyRJA1k1k818GpgFXFus2gWY08f9fhf4aWbuDewPLKMy9/wdmbkHcEexLEmSeqGa2+TOBg4BngXIzIeBUb3dYURsT2W42+uKz3slM58BjgOmF5tNB47v7T4kSRroqgn4lzPzlQ0LETEEyD7sc3egE/hBRCyJiO9FxDbATsWIeRtGzuv1fyIkSRroqgn4X0bEhcBWEfFh4EfAT/qwzyHAOODqzHwvlSvzqz4cHxFnRcTCiFjY2dnZhzIkSSqvagL+Aio97vuBvwFuA77Uh312AB2ZuaBYnkUl8FdGxM4AxeOq7t6cmdMysz0z20eOHNmHMiRJKq9qZpNbHxHTgQVUDs0vz8xeH6LPzD9GxBMRsVdmLgeOBB4sfiYD3yge5/Z2H5K2wPxL67evw51pWqqXzQZ8RHwMuAb4LyrTxY6JiL/JzH/vw37/J/DDiHgL8ChwBpWjCTdFxJnA48Apffh8SZIGtGo
GuvkWcHhmPgIQEe8E/g3odcBn5r1AezcvHdnbz5QkSa+r5hz8qg3hXniUTZwflyRJzWGTPfiIOLF4+kBE3AbcROUc/ClURrOTJElNqqdD9B/v8nwl8IHieSfgMLKSJDWxTQZ8Zp5Rz0IkSVL/qeYq+jFUrnpv67q908VKktS8qrmKfg6VceN/AqyvbTmSJKk/VBPwL2XmFTWvRJIk9ZtqAv67EXERcDvw8oaVmdmnOeElSVLtVBPw+wKfAI7g9UP0WSxLkqQmVE3AnwDs3nXKWGkgunzdSXXb15QhN9dtX5LKqZqR7H4L7FDrQiRJUv+ppge/E/C7iLiHN56D9zY5SZKaVDUBf1HNq5AkSf2qmvngf1mPQiRJUv+pZiS756hcNQ/wFmAo8EJmbl/LwiRJUu9V04PfrutyRBwPHFiziiRJUp9VcxX9G2TmHLwHXpKkplbNIfoTuywOAtp5/ZC9JFVv/qX129fhU+u3L6kJVXMVfdd54dcBK4DjalKNJEnqF9Wcg3deeEmSWswmAz4ivtLD+zIzv1aDeiRJUj/oqQf/QjfrtgHOBEYABrwkSU1qkwGfmd/a8DwitgPOBc4AZgLf2tT7JElS4/V4Dj4ihgPnA6cD04FxmfnnehQmSZJ6r6dz8P8InAhMA/bNzOfrVpUkSeqTnga6+RzwduBLwJMR8Wzx81xEPFuf8iRJUm/0dA5+i0e5kyRJzaGagW4GjnqOsgWOtCVJqhl76ZIklZA9eLW0y9ed1OgSJKkp2YOXJKmEDHhJkkqoYQEfEYMjYklE3Fosj4mIBRHxcETcGBFvaVRtkiS1ukb24M8FlnVZvgy4PDP3AP5MZcx7SZLUCw0J+IgYDXwM+F6xHMARwKxik+nA8Y2oTZKkMmhUD/47wN8B64vlEcAzmbmuWO4AdunujRFxVkQsjIiFnZ2dta9UkqQWVPeAj4hjgVWZuajr6m42ze7en5nTMrM9M9tHjhxZkxolSWp1jbgP/hBgQkR8FBgGbE+lR79DRAwpevGjgScbUJskSaVQ9x58Zk7NzNGZ2QZMAn6RmacD84GTi80mA3PrXZskSWXRTPfBfxE4PyIeoXJO/roG1yNJUstq6FC1mXkncGfx/FHgwEbWI0lSWTRTD16SJPUTA16SpBJyNjmpCdVzlrwpQ26u274k1Y89eEmSSsiAlySphAx4SZJKyICXJKmEDHhJkkrIgJckqYQMeEmSSsiAlySphBzoZqCYf2nddlXPQVokSd2zBy9JUgnZg++i3j3PKXXdmyRpILEHL0lSCdmDl1ROdbzuhMOn1m9fUpXswUuSVEIGvCRJJWTAS5JUQga8JEklZMBLklRCBrwkSSVkwEuSVEIGvCRJJWTAS5JUQga8JEklZMBLklRCjkXfSPUcK1vahHrOojhlyM1125c00NmDlySphAx4SZJKyICXJKmEPAcvSX3l3PNqQnXvwUfErhExPyKWRcQDEXFusX54RMyLiIeLx7fVuzZJksqiEYfo1wGfy8x9gIOAsyPi3cAFwB2ZuQdwR7EsSZJ6oe4Bn5lPZebi4vlzwDJgF+A4YHqx2XTg+HrXJklSWTT0IruIaAPeCywAdsrMp6DynwBg1Cbec1ZELIyIhZ2dnfUqVZKkltKwgI+IbYGbgfMy89lq35eZ0zKzPTPbR44cWbsCJUlqYQ0J+IgYSiXcf5iZPy5Wr4yInYvXdwZWNaI2SZLKoBFX0QdwHbAsM7/d5aVbgMnF88nA3HrXJklSWTTiPvhDgE8A90fEvcW6C4FvADdFxJnA48ApDahNkqRSqHvAZ+Z/ALGJl4+sZy2SJJWVI9lJUitx1DxVybHoJUkqIXvwA0Q95/yWNsW556X6sQcvSVIJ2YOXVEoeLdBAZw9ekqQSMuAlSSohA16SpBIy4CVJKiEDXpKkEjLgJUkqIQNekqQSMuAlSSohA16SpBIy4CVJKiEDXpKkEnIseknqI8e9VzOyBy9JUgnZg28g52iXJNW
KPXhJkkrIgJckqYQMeEmSSsiAlySphLzITpJaSF1vyavbnoD5l9ZvX4dPrd++GsgevCRJJWTAS5JUQga8JEkl5Dl4SVL36nleXP3OHrwkSSVkwEuSVEIGvCRJJdR0AR8Rx0TE8oh4JCIuaHQ9kiS1oqYK+IgYDPwf4CPAu4FTI+Ldja1KkqTW02xX0R8IPJKZjwJExEzgOODBhlYlSSqNy+c9VLd9TfnwnnXb18aaqgcP7AI80WW5o1gnSZK2QLP14KObdfmGDSLOAs4qFp+PiOX9sN8dgaf74XOagW1pTralOdmWHpzfnx+2ZWr8e7mwdh+9kfNr05Z3VLNRswV8B7Brl+XRwJNdN8jMacC0/txpRCzMzPb+/MxGsS3NybY0J9vSnGxL/2i2Q/T3AHtExJiIeAswCbilwTVJktRymqoHn5nrIuIc4GfAYOD7mflAg8uSJKnlNFXAA2TmbcBtdd5tvx7ybzDb0pxsS3OyLc3JtvSDyMzNbyVJklpKs52DlyRJ/cCAlySphAZ8wLfy2PcR8f2IWBURS7usGx4R8yLi4eLxbY2ssVoRsWtEzI+IZRHxQEScW6xvufZExLCI+E1E/LZoy8XF+jERsaBoy43FnSItISIGR8SSiLi1WG7JtkTEioi4PyLujYiFxbqW+44BRMQOETErIn5X/N0c3IptiYi9it/Hhp9nI+K8VmwLQERMKf7ul0bEjOLfg4b8vQzogC/B2PfXA8dstO4C4I7M3AO4o1huBeuAz2XmPsBBwNnF76IV2/MycERm7g+MBY6JiIOAy4DLi7b8GTizgTVuqXOBZV2WW7kth2fm2C73Jrfidwzgu8BPM3NvYH8qv5+Wa0tmLi9+H2OBA4C/ALNpwbZExC7A3wLtmfkeKneDTaJRfy+ZOWB/gIOBn3VZngpMbXRdW9iGNmBpl+XlwM7F852B5Y2usZftmgt8uNXbA2wNLAbeR2U0qyHF+jd895r5h8qAU3cARwC3UhlxslXbsgLYcaN1LfcdA7YHfk9xoXQrt2Wj+o8C7m7VtvD6cOvDqdylditwdKP+XgZ0D55yjn2/U2Y+BVA8jmpwPVssItqA9wILaNH2FIe07wVWAfOA/wKeycx1xSat9F37DvB3wPpieQSt25YEbo+IRcWw19Ca37HdgU7gB8Wpk+9FxDa0Zlu6mgTMKJ63XFsy8w/AN4HHgaeANcAiGvT3MtADfrNj36u+ImJb4GbgvMx8ttH19FZmvpqVQ46jqcySuE93m9W3qi0XEccCqzJzUdfV3Wza9G0pHJKZ46icljs7Ig5rdEG9NAQYB1ydme8FXqAFDmH3pDgvPQH4UaNr6a3iOoHjgDHA24FtqHzXNlaXv5eBHvCbHfu+Ba2MiJ0BisdVDa6nahExlEq4/zAzf1ysbtn2AGTmM8CdVK4r2CEiNgwu1SrftUOACRGxAphJ5TD9d2jNtpCZTxaPq6ic5z2Q1vyOdQAdmbmgWJ5FJfBbsS0bfARYnJkri+VWbMuHgN9nZmdmrgV+DPw1Dfp7GegBX8ax728BJhfPJ1M5l930IiKA64BlmfntLi+1XHsiYmRE7FA834rKH/0yYD5wcrFZS7QlM6dm5ujMbKPy9/GLzDydFmxLRGwTEdtteE7lfO9SWvA7lpl/BJ6IiL2KVUcCD9KCbeniVF4/PA+t2ZbHgYMiYuvi37QNv5eG/L0M+JHsIuKjVHokG8a+v6TBJVUtImYAH6QyHeFK4CJgDnATsBuVL9spmfmnRtVYrYg4FLgLuJ/Xz/VeSOU8fEu1JyL2A6ZT+U4NAm7KzP8dEbtT6QUPB5YA/y0zX25cpVsmIj4IfD4zj23FthQ1zy4WhwD/mpmXRMQIWuw7BhARY4HvAW8BHgXOoPi+0Xpt2ZrK9VC7Z+aaYl2r/l4uBiZSuTNoCfApKufc6/73MuADXpKkMhroh+glSSolA16SpBIy4CVJKiEDXpKkEjLgJUkqIQNeUrci4oSIyIjYu9G
1SNpyBrykTTkV+A8qA9xIajEGvKQ3KeYEOITKtJaTinWDIuKfirmub42I2yLi5OK1AyLil8UkLj/bMMSopMYx4CV153gqc40/BPwpIsYBJ1KZnnhfKqNzHQyvzSFwJXByZh4AfB9omREhpbIasvlNJA1Ap1IZwhkqQ2yeCgwFfpSZ64E/RsT84vW9gPcA8yrDbzOYylSZkhrIgJf0BsUY4EcA74mIpBLYyevjuL/pLcADmXlwnUqUVAUP0Uva2MnADZn5jsxsy8xdgd8DTwMnFefid6Iy0RHAcmBkRLx2yD4i/qoRhUt6nQEvaWOn8ube+s3A26nMQ74UuJbKTH9rMvMVKv8puCwifgvcS2UObEkN5GxykqoWEdtm5vPFYfzfAIcUc5NLajKeg5e0JW6NiB2ozEH+NcNdal724CVJKiHPwUuSVEIGvCRJJWTAS5JUQga8JEklZMBLklRC/x8V8Dy0JvywRQAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Ages of survivors and non-survivors as two columns, aligned on the row\n",
 "# index, so they can be drawn as one stacked histogram.\n",
 "survived_age = train_dataset.loc[train_dataset['Survived'] == 1, ['Age']]\n",
 "dead_age = train_dataset.loc[train_dataset['Survived'] == 0, ['Age']]\n",
 "ages = pd.concat([survived_age, dead_age], axis=1)\n",
 "ages.columns = [\"Alive\", \"Dead\"]\n",
 "\n",
 "ax = ages.plot(kind='hist', alpha=0.5, stacked=True, bins=15, figsize=(8,5))\n",
 "ax.set_xlabel(\"Age\")\n",
 "ax.set_ylabel(\"Number of People\")\n",
 "\n",
 "# Drop only the rows whose Age is missing. A bare dropna() also discards every\n",
 "# row with a NaN in any other column (Cabin is NaN in several of the rows that\n",
 "# head() shows), so the correlation would be computed on the wrong subset.\n",
 "tra = train_dataset.dropna(subset=['Age', 'Survived'])\n",
 "print(\"Pearson Correlation Test = {}\".format(pearsonr(tra[\"Age\"], tra[\"Survived\"])[0]))"
 ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Does Fare have an effect on who survives?" 
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pearson Correlation Test = 0.13424105283521096\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "df1 = train_dataset[[\"Survived\", \"Age\", \"Fare\"]]\n",
 "\n",
 "# Plot each outcome as its own series so the legend names both colours;\n",
 "# a single legend entry \"Dead\" mislabelled the blue (surviving) points.\n",
 "fig, d = plt.subplots(figsize=(8, 5))\n",
 "for outcome, colour, label in [(0, 'r', 'Dead'), (1, 'b', 'Alive')]:\n",
 "    group = df1[df1['Survived'] == outcome]\n",
 "    d.scatter(group['Age'], group['Fare'], c=colour, label=label)\n",
 "d.set_xlabel('Age')\n",
 "d.set_ylabel('Fare')\n",
 "d.legend()\n",
 "\n",
 "# Pearson correlation test. Only rows missing Fare or Survived are dropped;\n",
 "# a bare dropna() would also discard rows with a NaN in unrelated columns\n",
 "# such as Cabin or Age.\n",
 "tra = train_dataset.dropna(subset=['Fare', 'Survived'])\n",
 "print(\"Pearson Correlation Test = {}\".format(pearsonr(tra[\"Fare\"], tra[\"Survived\"])[0]))"
 ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Does the port of embarkation (Embarked) affect chances of Survival?" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false, "deletable": true, "editable": true, "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Ryan\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:4619: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " self._update_inplace(new_data)\n", "C:\\Users\\Ryan\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:2530: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n", " obj = obj._drop_axis(labels, axis, level=level, errors=errors)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Keep every passenger whose port of embarkation is known. The explicit copy\n",
 "# means the re-encoding below writes to its own frame instead of a slice of\n",
 "# train_dataset, which is what raised the SettingWithCopyWarning.\n",
 "dropped_ = train_dataset.dropna(subset=['Embarked']).copy()\n",
 "\n",
 "# Encode the port letters as S -> 0, C -> 1, Q -> 2.\n",
 "dropped_['Embarked'] = dropped_['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})\n",
 "\n",
 "# One row per port, one column per outcome. crosstab counts from dropped_\n",
 "# alone and keeps ports that have passengers in only one of the two groups,\n",
 "# which a left-merge of survivor counts with death counts can drop.\n",
 "df_embarked = pd.crosstab(dropped_['Embarked'], dropped_['Survived'])\n",
 "df_embarked = df_embarked.reindex(columns=[1, 0], fill_value=0)\n",
 "df_embarked = df_embarked.rename(columns={1: 'Survived', 0: 'Dead'})\n",
 "\n",
 "ax = df_embarked.plot(kind=\"bar\", figsize=(8, 5))\n",
 "ax.set_xlabel('Embarked (0 = S, 1 = C, 2 = Q)')\n",
 "ax.set_ylabel('Number of people')\n",
 "ax.set_title('Survival by port of embarkation')\n",
 "plt.show()"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }