{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "***\n", "***\n", "# Pandas使用简介\n", "使用pandas清洗泰坦尼克数据\n", "***\n", "***\n", "\n" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:14:39.846550", "start_time": "2017-05-18T11:14:39.843323" }, "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# learn more about pandas http://pandas.pydata.org/pandas-docs/stable/indexing.html" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# read data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T10:58:14.510869", "start_time": "2017-05-18T10:58:07.058666" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Import the Pandas library\n", "import pandas as pd\n", "# Load the train and test datasets to create two DataFrames\n", "train_url = \"http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv\"\n", "train = pd.read_csv(train_url)\n", "\n", "test_url = \"http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv\"\n", "test = pd.read_csv(test_url)\n", "#Print the `head` of the train and test dataframes\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T10:58:25.015659", "start_time": "2017-05-18T10:58:24.994400" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T10:59:19.774905", "start_time": "2017-05-18T10:59:19.762068" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "train.to_csv('/Users/chengjun/github/cjc/data/tatanic_train.csv')\n", "test.to_csv('/Users/chengjun/github/cjc/data/tatanic_test.csv')" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "You can easily explore a DataFrame \n", " - .describe() summarizes the columns/features of the DataFrame, including the count of observations, mean, max and so on. \n", " - Another useful trick is to look at the dimensions of the DataFrame. This is done by requesting the .shape attribute of your DataFrame object. (ex. 
your_data.shape)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T10:59:57.843346", "start_time": "2017-05-18T10:59:57.838260" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "(891, 12)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T10:59:49.526000", "start_time": "2017-05-18T10:59:49.462373" }, "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassAgeSibSpParchFare
count891.000000891.000000891.000000714.000000891.000000891.000000891.000000
mean446.0000000.3838382.30864229.6991180.5230080.38159432.204208
std257.3538420.4865920.83607114.5264971.1027430.80605749.693429
min1.0000000.0000001.0000000.4200000.0000000.0000000.000000
25%223.5000000.0000002.00000020.1250000.0000000.0000007.910400
50%446.0000000.0000003.00000028.0000000.0000000.00000014.454200
75%668.5000001.0000003.00000038.0000001.0000000.00000031.000000
max891.0000001.0000003.00000080.0000008.0000006.000000512.329200
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Age SibSp \\\n", "count 891.000000 891.000000 891.000000 714.000000 891.000000 \n", "mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n", "std 257.353842 0.486592 0.836071 14.526497 1.102743 \n", "min 1.000000 0.000000 1.000000 0.420000 0.000000 \n", "25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n", "50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n", "75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n", "max 891.000000 1.000000 3.000000 80.000000 8.000000 \n", "\n", " Parch Fare \n", "count 891.000000 891.000000 \n", "mean 0.381594 32.204208 \n", "std 0.806057 49.693429 \n", "min 0.000000 0.000000 \n", "25% 0.000000 7.910400 \n", "50% 0.000000 14.454200 \n", "75% 0.000000 31.000000 \n", "max 6.000000 512.329200 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.describe()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:00:14.965414", "start_time": "2017-05-18T11:00:14.958460" }, "collapsed": true, "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "train = pd.read_csv('/Users/chengjun/github/cjc/data/tatanic_train.csv',\\\n", " sep = \",\", header=0)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:00:24.611152", "start_time": "2017-05-18T11:00:24.591049" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
00103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
11211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
22313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Survived Pclass \\\n", "0 0 1 0 3 \n", "1 1 2 1 1 \n", "2 2 3 1 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train[:3]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Selecting data" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:08:35.416956", "start_time": "2017-05-18T11:08:35.411141" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 1\n", "2 1\n", "3 1\n", "4 0\n", "Name: Survived, dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Passengers that survived vs passengers that passed away\n", "train[\"Survived\"][:5]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:08.825673", "start_time": "2017-05-18T11:01:08.819230" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "0 549\n", "1 342\n", "Name: Survived, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Passengers that survived vs passengers that passed away\n", "train[\"Survived\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:18.545677", "start_time": "2017-05-18T11:01:18.539134" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { 
"text/plain": [ "0 0.616162\n", "1 0.383838\n", "Name: Survived, dtype: float64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# As proportions\n", "train[\"Survived\"].value_counts(normalize = True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:26.497032", "start_time": "2017-05-18T11:01:26.490532" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "male 577\n", "female 314\n", "Name: Sex, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train['Sex'].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:33.947389", "start_time": "2017-05-18T11:01:33.936907" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "0 468\n", "1 109\n", "Name: Survived, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Males that survived vs males that passed away\n", "train[\"Survived\"][train[\"Sex\"] == 'male'].value_counts()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:41.372036", "start_time": "2017-05-18T11:01:41.362275" }, "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "1 233\n", "0 81\n", "Name: Survived, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Females that survived vs Females that passed away\n", "train[\"Survived\"][train[\"Sex\"] == 'female'].value_counts()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:01:51.099388", "start_time": "2017-05-18T11:01:51.088912" }, "collapsed": false, "slideshow": { 
"slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "0 0.811092\n", "1 0.188908\n", "Name: Survived, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normalized male survival\n", "train[\"Survived\"][train[\"Sex\"] == 'male'].value_counts(normalize = True) " ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:00.483327", "start_time": "2017-05-18T11:02:00.473494" }, "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "1 0.742038\n", "0 0.257962\n", "Name: Survived, dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normalized female survival\n", "train[\"Survived\"][train[\"Sex\"] == 'female'].value_counts(normalize = True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:07.983544", "start_time": "2017-05-18T11:02:07.921434" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 0.0\n", "1 0.0\n", "2 0.0\n", "Name: Child, dtype: float64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " app.launch_new_instance()\n", "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" ] } ], "source": [ "# 
Create the column Child: 1 for passengers younger than 5, 0 for those aged 5 or older, NaN where Age is missing. Print the new column.\n", "train[\"Child\"] = float('NaN')\n", "# Write through .loc so the assignment lands on `train` itself; `train.Child[mask] = value` is chained indexing and raises SettingWithCopyWarning.\n", "train.loc[train.Age < 5, \"Child\"] = 1\n", "train.loc[train.Age >= 5, \"Child\"] = 0\n", "# Parenthesised print runs on both the Python 2 kernel and Python 3.\n", "print(train.Child[:3])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:22.777516", "start_time": "2017-05-18T11:02:22.768363" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/plain": [ "1 0.675\n", "0 0.325\n", "Name: Survived, dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normalized survival rates for children (Age < 5, i.e. Child == 1)\n", "train.Survived[train.Child == 1].value_counts(normalize = True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:30.579362", "start_time": "2017-05-18T11:02:30.568076" }, "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "data": { "text/plain": [ "0 0.609792\n", "1 0.390208\n", "Name: Survived, dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Normalized survival rates for passengers aged 5 or older (Child == 0)\n", "train.Survived[train.Child == 0].value_counts(normalize = True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:40.742798", "start_time": "2017-05-18T11:02:40.713399" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 0\n", "1 1\n", "2 0\n", "Name: Survived, dtype: int64\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" ] } ], "source": [ "# Create an independent copy of test: test_one (plain `test_one = test` would only alias it, so the new column would also appear in `test`)\n", "test_one = test.copy()\n", "# Initialize a Survived column to 0\n", "test_one['Survived'] = 0\n", "# Set Survived to 1 if Sex equals \"female\" (via .loc, avoiding chained assignment) and print the `Survived` column from `test_one`\n", "test_one.loc[test_one.Sex == 'female', 'Survived'] = 1\n", "\n", "print(test_one.Survived[:3])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:02:53.491413", "start_time": "2017-05-18T11:02:53.375872" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " from ipykernel import kernelapp as app\n", "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " app.launch_new_instance()\n", "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", "/Users/chengjun/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" ] } ], "source": [ "# Convert the male and female groups to integer form.\n", "# .loc[mask, column] writes into `train` directly; `train[col][mask] = value` is chained assignment and may modify a temporary copy.\n", "train.loc[train[\"Sex\"] == \"male\", \"Sex\"] = 0\n", "train.loc[train[\"Sex\"] == \"female\", \"Sex\"] = 1\n", "\n", "# Impute the Embarked variable\n", "train[\"Embarked\"] = train[\"Embarked\"].fillna('S')\n", "\n", "# Convert the Embarked classes to integer form\n", "train.loc[train[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", "train.loc[train[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", "train.loc[train[\"Embarked\"] == \"Q\", \"Embarked\"] = 2" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:03:36.728445", "start_time": "2017-05-18T11:03:36.725475" }, "slideshow": { "slide_type": "slide" } }, "source": [ "# sort_values" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:04:22.488567", "start_time": "2017-05-18T11:04:22.464544" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedChild
80380380413Thomas, Master. Assad Alexander00.420126258.5167NaN11.0
75575575612Hamalainen, Master. Viljo00.671125064914.5000NaN01.0
64464464513Baclini, Miss. Eugenie10.7521266619.2583NaN11.0
46946947013Baclini, Miss. Helene Barbara10.7521266619.2583NaN11.0
78787912Caldwell, Master. Alden Gates00.830224873829.0000NaN01.0
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Survived Pclass \\\n", "803 803 804 1 3 \n", "755 755 756 1 2 \n", "644 644 645 1 3 \n", "469 469 470 1 3 \n", "78 78 79 1 2 \n", "\n", " Name Sex Age SibSp Parch Ticket Fare \\\n", "803 Thomas, Master. Assad Alexander 0 0.42 0 1 2625 8.5167 \n", "755 Hamalainen, Master. Viljo 0 0.67 1 1 250649 14.5000 \n", "644 Baclini, Miss. Eugenie 1 0.75 2 1 2666 19.2583 \n", "469 Baclini, Miss. Helene Barbara 1 0.75 2 1 2666 19.2583 \n", "78 Caldwell, Master. Alden Gates 0 0.83 0 2 248738 29.0000 \n", "\n", " Cabin Embarked Child \n", "803 NaN 1 1.0 \n", "755 NaN 0 1.0 \n", "644 NaN 1 1.0 \n", "469 NaN 1 1.0 \n", "78 NaN 0 1.0 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.sort_values(by = ['Age'])[:5]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:04:35.307577", "start_time": "2017-05-18T11:04:35.285188" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedChild
63063063111Barkworth, Mr. Algernon Henry Wilson080.0002704230.0000A2300.0
85185185203Svensson, Mr. Johan074.0003470607.7750NaN00.0
49349349401Artagaveytia, Mr. Ramon071.000PC 1760949.5042NaN10.0
96969701Goldschmidt, Mr. George B071.000PC 1775434.6542A510.0
11611611703Connors, Mr. Patrick070.5003703697.7500NaN20.0
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Survived Pclass \\\n", "630 630 631 1 1 \n", "851 851 852 0 3 \n", "493 493 494 0 1 \n", "96 96 97 0 1 \n", "116 116 117 0 3 \n", "\n", " Name Sex Age SibSp Parch Ticket \\\n", "630 Barkworth, Mr. Algernon Henry Wilson 0 80.0 0 0 27042 \n", "851 Svensson, Mr. Johan 0 74.0 0 0 347060 \n", "493 Artagaveytia, Mr. Ramon 0 71.0 0 0 PC 17609 \n", "96 Goldschmidt, Mr. George B 0 71.0 0 0 PC 17754 \n", "116 Connors, Mr. Patrick 0 70.5 0 0 370369 \n", "\n", " Fare Cabin Embarked Child \n", "630 30.0000 A23 0 0.0 \n", "851 7.7750 NaN 0 0.0 \n", "493 49.5042 NaN 1 0.0 \n", "96 34.6542 A5 1 0.0 \n", "116 7.7500 NaN 2 0.0 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.sort_values(by = ['Age'], ascending = False)[:5]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:07:18.231407", "start_time": "2017-05-18T11:07:18.209109" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedChild
85185185203Svensson, Mr. Johan074.0003470607.7750NaN00.0
11611611703Connors, Mr. Patrick070.5003703697.7500NaN20.0
28028028103Duane, Mr. Frank065.0003364397.7500NaN20.0
48348348413Turkula, Mrs. (Hedwig)163.00041349.5875NaN00.0
32632632703Nysveen, Mr. Johan Hansen061.0003453646.2375NaN00.0
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Survived Pclass Name Sex \\\n", "851 851 852 0 3 Svensson, Mr. Johan 0 \n", "116 116 117 0 3 Connors, Mr. Patrick 0 \n", "280 280 281 0 3 Duane, Mr. Frank 0 \n", "483 483 484 1 3 Turkula, Mrs. (Hedwig) 1 \n", "326 326 327 0 3 Nysveen, Mr. Johan Hansen 0 \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked Child \n", "851 74.0 0 0 347060 7.7750 NaN 0 0.0 \n", "116 70.5 0 0 370369 7.7500 NaN 2 0.0 \n", "280 65.0 0 0 336439 7.7500 NaN 2 0.0 \n", "483 63.0 0 0 4134 9.5875 NaN 0 0.0 \n", "326 61.0 0 0 345364 6.2375 NaN 0 0.0 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.sort_values(by = ['Pclass', 'Age'], ascending = False)[:5]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# groupby" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:09:17.772900", "start_time": "2017-05-18T11:09:17.752055" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdSurvivedPclassAgeParchFareChild
SibSp
0276257276865210143014788.2511315620.75309.0
191694919031124305506.421379226.800415.0
211520115481366565.50181449.10414.0
351295145441167.00211102.54184.0
468516869354127.0027573.40007.0
51679168401551.0010234.50001.0
833653372021NaN14486.8500NaN
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Survived Pclass Age Parch Fare \\\n", "SibSp \n", "0 276257 276865 210 1430 14788.25 113 15620.7530 \n", "1 91694 91903 112 430 5506.42 137 9226.8004 \n", "2 11520 11548 13 66 565.50 18 1449.1041 \n", "3 5129 5145 4 41 167.00 21 1102.5418 \n", "4 6851 6869 3 54 127.00 27 573.4000 \n", "5 1679 1684 0 15 51.00 10 234.5000 \n", "8 3365 3372 0 21 NaN 14 486.8500 \n", "\n", " Child \n", "SibSp \n", "0 9.0 \n", "1 15.0 \n", "2 4.0 \n", "3 4.0 \n", "4 7.0 \n", "5 1.0 \n", "8 NaN " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.groupby(['SibSp']).sum()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2017-05-18T11:10:54.735809", "start_time": "2017-05-18T11:10:54.707847" }, "collapsed": false, "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PassengerIdPclassAgeParchFareChild
SibSpSurvived
0018390218430010049672.50436611.60711.0
192355925654265115.75709009.14598.0
1040533406302322739.00713256.17101.0
151161512731982767.42665970.629414.0
205206522139364.005511.20000.0
16314632727201.5013937.90414.0
30388939013457.0017539.69184.0
1124012447110.004562.85000.0
406289630445102.0021502.70006.0
1562565925.00670.70001.0
50167916841551.0010234.50001.0
803365337221NaN14486.8500NaN
\n", "
" ], "text/plain": [ " Unnamed: 0 PassengerId Pclass Age Parch Fare \\\n", "SibSp Survived \n", "0 0 183902 184300 1004 9672.50 43 6611.6071 \n", " 1 92355 92565 426 5115.75 70 9009.1459 \n", "1 0 40533 40630 232 2739.00 71 3256.1710 \n", " 1 51161 51273 198 2767.42 66 5970.6294 \n", "2 0 5206 5221 39 364.00 5 511.2000 \n", " 1 6314 6327 27 201.50 13 937.9041 \n", "3 0 3889 3901 34 57.00 17 539.6918 \n", " 1 1240 1244 7 110.00 4 562.8500 \n", "4 0 6289 6304 45 102.00 21 502.7000 \n", " 1 562 565 9 25.00 6 70.7000 \n", "5 0 1679 1684 15 51.00 10 234.5000 \n", "8 0 3365 3372 21 NaN 14 486.8500 \n", "\n", " Child \n", "SibSp Survived \n", "0 0 1.0 \n", " 1 8.0 \n", "1 0 1.0 \n", " 1 14.0 \n", "2 0 0.0 \n", " 1 4.0 \n", "3 0 4.0 \n", " 1 0.0 \n", "4 0 6.0 \n", " 1 1.0 \n", "5 0 1.0 \n", "8 0 NaN " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.groupby(['SibSp', 'Survived']).sum()" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "learn more about pandas http://pandas.pydata.org/pandas-docs/stable/indexing.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" }, "latex_envs": { "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 0 }, "toc": { "toc_cell": false, "toc_number_sections": false, "toc_section_display": "none", "toc_threshold": 6, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 0 }