{"cells":[{"cell_type":"code","source":"from google.colab import drive\nimport os\ndrive.mount('/content/gdrive')\n# Establecer ruta de acceso en drive\nimport os\nprint(os.getcwd())\nos.chdir(\"/content/gdrive/My Drive\")","metadata":{"id":"j2626UpFhsvN","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"9843b2ca1bcd4ff3970c38b0de626e7b","outputId":"3bd1ed8a-1d5d-4696-ac38-f25cb4e95bd9","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":3774,"user_tz":180,"timestamp":1650297871775},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stdout","text":"Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n/content/gdrive/My Drive\n"}],"execution_count":97},{"cell_type":"code","source":"import pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\ntrain=pd.read_csv('train_titanic.csv')\ntest=pd.read_csv('test_titanic.csv')\ntest","metadata":{"id":"oj-XQ1jxiGt4","colab":{"height":423,"base_uri":"https://localhost:8080/"},"cell_id":"f4f3d2aab6eb42fa83e583803627deba","outputId":"0c469fb3-ab29-4c5a-ad16-74e382fc304e","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":26,"user_tz":180,"timestamp":1650297871777},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Pclass Name \\\n0 892 3 Kelly, Mr. James \n1 893 3 Wilkes, Mrs. James (Ellen Needs) \n2 894 2 Myles, Mr. Thomas Francis \n3 895 3 Wirz, Mr. Albert \n4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) \n.. ... ... ... \n413 1305 3 Spector, Mr. Woolf \n414 1306 1 Oliva y Ocana, Dona. Fermina \n415 1307 3 Saether, Mr. Simon Sivertsen \n416 1308 3 Ware, Mr. Frederick \n417 1309 3 Peter, Master. Michael J \n\n Sex Age SibSp Parch Ticket Fare Cabin Embarked \n0 male 34.5 0 0 330911 7.8292 NaN Q \n1 female 47.0 1 0 363272 7.0000 NaN S \n2 male 62.0 0 0 240276 9.6875 NaN Q \n3 male 27.0 0 0 315154 8.6625 NaN S \n4 female 22.0 1 1 3101298 12.2875 NaN S \n.. ... ... ... ... ... ... ... ... \n413 male NaN 0 0 A.5. 3236 8.0500 NaN S \n414 female 39.0 0 0 PC 17758 108.9000 C105 C \n415 male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S \n416 male NaN 0 0 359309 8.0500 NaN S \n417 male NaN 1 1 2668 22.3583 NaN C \n\n[418 rows x 11 columns]","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
\n \n \n
\n
0
\n
892
\n
3
\n
Kelly, Mr. James
\n
male
\n
34.5
\n
0
\n
0
\n
330911
\n
7.8292
\n
NaN
\n
Q
\n
\n
\n
1
\n
893
\n
3
\n
Wilkes, Mrs. James (Ellen Needs)
\n
female
\n
47.0
\n
1
\n
0
\n
363272
\n
7.0000
\n
NaN
\n
S
\n
\n
\n
2
\n
894
\n
2
\n
Myles, Mr. Thomas Francis
\n
male
\n
62.0
\n
0
\n
0
\n
240276
\n
9.6875
\n
NaN
\n
Q
\n
\n
\n
3
\n
895
\n
3
\n
Wirz, Mr. Albert
\n
male
\n
27.0
\n
0
\n
0
\n
315154
\n
8.6625
\n
NaN
\n
S
\n
\n
\n
4
\n
896
\n
3
\n
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
\n
female
\n
22.0
\n
1
\n
1
\n
3101298
\n
12.2875
\n
NaN
\n
S
\n
\n
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
\n
\n
413
\n
1305
\n
3
\n
Spector, Mr. Woolf
\n
male
\n
NaN
\n
0
\n
0
\n
A.5. 3236
\n
8.0500
\n
NaN
\n
S
\n
\n
\n
414
\n
1306
\n
1
\n
Oliva y Ocana, Dona. Fermina
\n
female
\n
39.0
\n
0
\n
0
\n
PC 17758
\n
108.9000
\n
C105
\n
C
\n
\n
\n
415
\n
1307
\n
3
\n
Saether, Mr. Simon Sivertsen
\n
male
\n
38.5
\n
0
\n
0
\n
SOTON/O.Q. 3101262
\n
7.2500
\n
NaN
\n
S
\n
\n
\n
416
\n
1308
\n
3
\n
Ware, Mr. Frederick
\n
male
\n
NaN
\n
0
\n
0
\n
359309
\n
8.0500
\n
NaN
\n
S
\n
\n
\n
417
\n
1309
\n
3
\n
Peter, Master. Michael J
\n
male
\n
NaN
\n
1
\n
1
\n
2668
\n
22.3583
\n
NaN
\n
C
\n
\n \n
\n
418 rows × 11 columns
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":98}],"execution_count":98},{"cell_type":"markdown","source":"Variable | Descripcion\n-------------------|------------------\nSurvived|\tSurvived (1) or died (0)\nPclass\t|Passenger’s class\nName\t|Passenger’s name\nSex\t|Passenger’s sex\nAge\t|Passenger’s age\nSibSp\t|Number of siblings/spouses aboard\nParch\t|Number of parents/children aboard\nTicket\t|Ticket number\nFare\t|Fare\nCabin\t|Cabin\nEmbarked\t|Port of embarkation","metadata":{"id":"7IKUg4AiqUEe","cell_id":"305735206504454a9aa18c17c3a0aaf0","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"# Feature Engineering","metadata":{"id":"GjY-wdXUVY-E","cell_id":"9ba45f99eba7436e8b9fc6e9febfa4a3","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full= pd.concat([train,test], axis=0)\nfull.head()","metadata":{"id":"eHmQy8Dvq8rn","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"20d84e0693944b648bbb7e13f4a30043","outputId":"df76faa1-553c-40af-8030-5abd5c7b7dc7","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":419,"user_tz":180,"timestamp":1650297876806},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked \n0 0 A/5 21171 7.2500 NaN S \n1 0 PC 17599 71.2833 C85 C \n2 0 STON/O2. 3101282 7.9250 NaN S \n3 0 113803 53.1000 C123 S \n4 0 373450 8.0500 NaN S ","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":99}],"execution_count":99},{"cell_type":"code","source":"# Exploraremos la columna passenger name y nos interesa la abreviatura de cada uno\nimport re\nfull['Title']=full['Name'].apply(lambda x : re.sub(\"(.*, )|(\\\\..*)\", \"\", x)) \nfull.head()","metadata":{"id":"-wtvYTKIsJBh","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"871ccd9dd15345958101d1f325c84fbc","outputId":"77ba6fb2-be1d-4067-b35d-76587c6360e2","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":18,"user_tz":180,"timestamp":1650297901127},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked Title \n0 0 A/5 21171 7.2500 NaN S Mr \n1 0 PC 17599 71.2833 C85 C Mrs \n2 0 STON/O2. 3101282 7.9250 NaN S Miss \n3 0 113803 53.1000 C123 S Mrs \n4 0 373450 8.0500 NaN S Mr ","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":100}],"execution_count":100},{"cell_type":"code","source":"pd.crosstab(full.Sex, full.Title)","metadata":{"id":"UZ1Uu2C1sBc3","colab":{"height":143,"base_uri":"https://localhost:8080/"},"cell_id":"9fd856110eb74fef85874f8c8295cac4","outputId":"b5d2c5a2-7aaf-4c04-88f3-4c7920ae4b8d","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":849,"user_tz":180,"timestamp":1650297922960},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"Title Capt Col Don Dona Dr Jonkheer Lady Major Master Miss Mlle \\\nSex \nfemale 0 0 0 1 1 0 1 0 0 260 2 \nmale 1 4 1 0 7 1 0 2 61 0 0 \n\nTitle Mme Mr Mrs Ms Rev Sir the Countess \nSex \nfemale 1 0 197 2 0 0 1 \nmale 0 757 0 0 8 1 0 ","text/html":"\n
\n "},"metadata":{},"execution_count":102}],"execution_count":102},{"cell_type":"code","source":"# Extrayendo el apellido\nfull['Surname']=full['Name'].apply(lambda x : x[:x.index(',')])\nfull.head()","metadata":{"id":"3XIOqEcVcn5J","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"daa05e9a6a7b48919d21b7a82284f881","outputId":"5bc83ca7-7434-49ad-b452-a313940b6707","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":369,"user_tz":180,"timestamp":1650297934276},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked Title Surname \n0 0 A/5 21171 7.2500 NaN S Mr Braund \n1 0 PC 17599 71.2833 C85 C Mrs Cumings \n2 0 STON/O2. 3101282 7.9250 NaN S Miss Heikkinen \n3 0 113803 53.1000 C123 S Mrs Futrelle \n4 0 373450 8.0500 NaN S Mr Allen ","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
Surname
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
Braund
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
Cumings
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
Heikkinen
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
Futrelle
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Allen
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":103}],"execution_count":103},{"cell_type":"markdown","source":"## Las familias se hundieron o nadaron juntos\n\nPrimero vamos a hacer una variable del tamaño de la familia basada en el número de hermanos/cónyuge(s) (¿quizás alguien tiene más de un cónyuge?) y el número de hijos/padres.","metadata":{"id":"wh1WDhe0wL4n","cell_id":"ef3bad66940d4d658a0cb4873aeb6c58","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full['Fsize']=full['SibSp']+full['Parch']+1\nfull['Fsize']=full['Fsize'].astype('str')\nfull['Family']=full[['Surname', 'Fsize']].agg('_'.join, axis=1)\nfull.head()","metadata":{"id":"zlgi5Fm5wWgT","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"75d8dcd999d44275b9a22f0390747060","outputId":"e302d860-566e-40bb-8f13-4de910c8274d","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":552,"user_tz":180,"timestamp":1650297939484},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked Title Surname Fsize \\\n0 0 A/5 21171 7.2500 NaN S Mr Braund 2 \n1 0 PC 17599 71.2833 C85 C Mrs Cumings 2 \n2 0 STON/O2. 3101282 7.9250 NaN S Miss Heikkinen 1 \n3 0 113803 53.1000 C123 S Mrs Futrelle 2 \n4 0 373450 8.0500 NaN S Mr Allen 1 \n\n Family \n0 Braund_2 \n1 Cumings_2 \n2 Heikkinen_1 \n3 Futrelle_2 \n4 Allen_1 ","text/html":"\n
\n "},"metadata":{},"execution_count":109}],"execution_count":109},{"cell_type":"code","source":"# algunos graficos descriptivos\nplt.figure(figsize=(10,7))\nsns.barplot(y=\"F\", x=\"Fsize\", data=tr,hue='Survived', orient='v')\nplt.xlabel('Family size')\nplt.ylabel('Conteo')\nplt.legend(loc='upper right')","metadata":{"id":"VGjLRmmedNRD","colab":{"height":458,"base_uri":"https://localhost:8080/"},"cell_id":"608e76f3924747f3af10c39fa63108c3","outputId":"5065d9df-8901-43f1-a16c-46de8781cf8e","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":28,"user_tz":180,"timestamp":1650298287958},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":""},"metadata":{},"execution_count":110},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":110},{"cell_type":"markdown","source":"Podemos ver que hay una penalización de supervivencia para los hijos únicos y aquellos con familias de más de 4. Podemos colapsar esta variable en tres niveles que serán útiles ya que hay comparativamente menos familias numerosas. Vamos a crear una variable de tamaño de familia discretizada.","metadata":{"id":"rFUqpskJhjwn","cell_id":"abe62881e7de43829aa901f066ad39ca","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full['Fsize'].unique()","metadata":{"id":"2qcThq0UiWYA","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"fb78353a53ff4ab0a16d453449887767","outputId":"72422d06-b7c0-4d0e-849f-c1b1c55f4a94","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":311,"user_tz":180,"timestamp":1650298296050},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"array(['2', '1', '5', '3', '7', '6', '4', '8', '11'], dtype=object)"},"metadata":{},"execution_count":111}],"execution_count":111},{"cell_type":"code","source":"# Discretizar la variable family size\nfull['Fsize']=full['Fsize'].astype('int')\nfull['FsizeD']=np.where(full.Fsize ==1, 'singleton',# aqui viene el else\n np.where(((full.Fsize <5) & (full.Fsize >1)),'small','large'))\nfull.head()","metadata":{"id":"iTH-5cqMhm6P","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"ab80d7259da34a6d910562233f79dbe5","outputId":"0362a46d-34cb-4f3b-b4ca-728ee46f8855","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":602,"user_tz":180,"timestamp":1650298297982},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked Title Surname Fsize \\\n0 0 A/5 21171 7.2500 NaN S Mr Braund 2 \n1 0 PC 17599 71.2833 C85 C Mrs Cumings 2 \n2 0 STON/O2. 3101282 7.9250 NaN S Miss Heikkinen 1 \n3 0 113803 53.1000 C123 S Mrs Futrelle 2 \n4 0 373450 8.0500 NaN S Mr Allen 1 \n\n Family FsizeD \n0 Braund_2 small \n1 Cumings_2 small \n2 Heikkinen_1 singleton \n3 Futrelle_2 small \n4 Allen_1 singleton ","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
Surname
\n
Fsize
\n
Family
\n
FsizeD
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
Braund
\n
2
\n
Braund_2
\n
small
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
Cumings
\n
2
\n
Cumings_2
\n
small
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
Heikkinen
\n
1
\n
Heikkinen_1
\n
singleton
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
Futrelle
\n
2
\n
Futrelle_2
\n
small
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Allen
\n
1
\n
Allen_1
\n
singleton
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":112}],"execution_count":112},{"cell_type":"code","source":"# Crear el mosaic plot\nfrom statsmodels.graphics.mosaicplot import mosaic\nts=pd.crosstab(full.FsizeD, full.Survived)\nts.columns=['No','Si']\nts=ts.reset_index()\nts1=pd.melt(ts,id_vars=['FsizeD'])\nG = ts1.groupby([\"FsizeD\", \"variable\"]).sum()\nmosaic(G[\"value\"])","metadata":{"id":"bO4sfgd9iqZk","colab":{"height":755,"base_uri":"https://localhost:8080/"},"cell_id":"b60b7ea81223444ab5a24813ca78e4b6","outputId":"8490cdb2-12c8-4a17-b715-a16751068082","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":438,"user_tz":180,"timestamp":1650298301912},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"(
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":113},{"cell_type":"markdown","source":"La trama de mosaico muestra que preservamos nuestra regla de que hay una penalización de supervivencia entre los solteros y las familias numerosas, pero un beneficio para los pasajeros de familias pequeñas. Podemos hacer algo más con nuestra variable de edad, pero faltan valores de edad en 263 filas","metadata":{"id":"nLIx9tTqaDXe","cell_id":"5ac5a7fbc3bc45bcb732278f5191148d","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"## Tratar algunas otras variables","metadata":{"id":"sNr2WekbaP6W","cell_id":"7a3959cb8daf4ad5aa69aab2a26b6ecb","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"# Esta variable tiene muchos nulos\nfull.Cabin[1:28]","metadata":{"id":"aip43ji8aRxk","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"f7bc41ccabc44612b9ddcda64ae13b58","outputId":"819268cc-f6df-4e82-b6bd-c1b388d85cf6","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":411,"user_tz":180,"timestamp":1650298305484},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"1 C85\n2 NaN\n3 C123\n4 NaN\n5 NaN\n6 E46\n7 NaN\n8 NaN\n9 NaN\n10 G6\n11 C103\n12 NaN\n13 NaN\n14 NaN\n15 NaN\n16 NaN\n17 NaN\n18 NaN\n19 NaN\n20 NaN\n21 D56\n22 NaN\n23 A6\n24 NaN\n25 NaN\n26 NaN\n27 C23 C25 C27\nName: Cabin, dtype: object"},"metadata":{},"execution_count":114}],"execution_count":114},{"cell_type":"markdown","source":"Extraigamos las primeras letras de la columna Cabin que representa el Deck","metadata":{"id":"hKFWT7O0b9QR","cell_id":"0835b8a0cfe945ebabb661ab33ba3ebf","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full['Deck']=full['Cabin'].astype(str).str[0]\nfull['Deck']=full.Deck.str.upper()\nfull.head()","metadata":{"id":"WIjnuzdZcDQS","colab":{"height":206,"base_uri":"https://localhost:8080/"},"cell_id":"8e1994763fbb4b4fa697f0bb473fbac6","outputId":"27826faf-69f5-4a4f-ac2a-4c761b589829","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":403,"user_tz":180,"timestamp":1650298307739},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked Title Surname Fsize \\\n0 0 A/5 21171 7.2500 NaN S Mr Braund 2 \n1 0 PC 17599 71.2833 C85 C Mrs Cumings 2 \n2 0 STON/O2. 3101282 7.9250 NaN S Miss Heikkinen 1 \n3 0 113803 53.1000 C123 S Mrs Futrelle 2 \n4 0 373450 8.0500 NaN S Mr Allen 1 \n\n Family FsizeD Deck \n0 Braund_2 small N \n1 Cumings_2 small C \n2 Heikkinen_1 singleton N \n3 Futrelle_2 small C \n4 Allen_1 singleton N ","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
Surname
\n
Fsize
\n
Family
\n
FsizeD
\n
Deck
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
Braund
\n
2
\n
Braund_2
\n
small
\n
N
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
Cumings
\n
2
\n
Cumings_2
\n
small
\n
C
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
Heikkinen
\n
1
\n
Heikkinen_1
\n
singleton
\n
N
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
Futrelle
\n
2
\n
Futrelle_2
\n
small
\n
C
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Allen
\n
1
\n
Allen_1
\n
singleton
\n
N
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":115}],"execution_count":115},{"cell_type":"code","source":"full.Deck.value_counts()","metadata":{"id":"TQLCschkc_qQ","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"b12dd5df079c46fba378a3ba65f77534","outputId":"abde8b19-04ba-4653-a5d2-8fa2f1269f6a","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":255,"user_tz":180,"timestamp":1650298310866},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"N 1014\nC 94\nB 65\nD 46\nE 41\nA 22\nF 21\nG 5\nT 1\nName: Deck, dtype: int64"},"metadata":{},"execution_count":116}],"execution_count":116},{"cell_type":"markdown","source":"# Problema de nulos\n\nAhora estamos listos para comenzar a explorar los datos faltantes y rectificarlos a través de la imputación. Hay varias maneras diferentes en las que podríamos hacer esto. \n\nDado el pequeño tamaño del conjunto de datos, probablemente no deberíamos optar por eliminar observaciones completas (filas) o variables (columnas) que contengan valores faltantes. \n\nNos queda la opción de reemplazar los valores faltantes con valores sensibles dada la distribución de los datos, por ejemplo, la media, la mediana o la moda. Finalmente, podríamos ir con la predicción. Usaremos los dos últimos métodos y confiaré en alguna visualización de datos para guiar nuestras decisiones.","metadata":{"id":"2N6sMTnzdGuQ","cell_id":"8a89db6bea16485185f9b12300741797","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"## Imputacion sensible","metadata":{"id":"j9bCj2oadzp2","cell_id":"33b2f33649d84a6d81b29ddde0a4ec38","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full.dtypes","metadata":{"id":"8SlTBp4ve88B","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"90d451c5abb34ca1990097430f147345","outputId":"7753aa03-38a1-4d76-ba06-60fc32e07d5e","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":289,"user_tz":180,"timestamp":1650298313893},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId int64\nSurvived float64\nPclass int64\nName object\nSex object\nAge float64\nSibSp int64\nParch int64\nTicket object\nFare float64\nCabin object\nEmbarked object\nTitle object\nSurname object\nFsize int64\nFamily object\nFsizeD object\nDeck object\ndtype: object"},"metadata":{},"execution_count":117}],"execution_count":117},{"cell_type":"code","source":"full.Embarked.isnull().sum()","metadata":{"id":"3VuNAN5hfjJq","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"0ceb05c8aa654b84af5db40898d2c1a0","outputId":"5763c492-36f4-4d2d-b842-42f5041b0a96","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":399,"user_tz":180,"timestamp":1650298319500},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"2"},"metadata":{},"execution_count":118}],"execution_count":118},{"cell_type":"code","source":" # Pasajeros 62 y 830 no tienen datos\n full[(full.PassengerId == 62) | (full.PassengerId == 830)]['Embarked']","metadata":{"id":"_tZBx302fBN1","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"ff762ac88de4481d9565ac3160d7817c","outputId":"5719eb09-9164-456c-d30b-d5bf5393e7e5","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":3,"user_tz":180,"timestamp":1650298320894},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"61 NaN\n829 NaN\nName: Embarked, dtype: object"},"metadata":{},"execution_count":119}],"execution_count":119},{"cell_type":"code","source":"# elimiar los ids de pasajeros\nembark_fare = full[(full.PassengerId !=62)|(full.PassengerId !=830)]\nembark_fare","metadata":{"id":"e6IQaPa9fveg","colab":{"height":520,"base_uri":"https://localhost:8080/"},"cell_id":"b6d86a129ebd4aef8cc0592f247a8fa7","outputId":"fcee2067-1c7e-4b9b-ebc6-4a2431e8054f","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":7,"user_tz":180,"timestamp":1650298322434},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" PassengerId Survived Pclass \\\n0 1 0.0 3 \n1 2 1.0 1 \n2 3 1.0 3 \n3 4 1.0 1 \n4 5 0.0 3 \n.. ... ... ... \n413 1305 NaN 3 \n414 1306 NaN 1 \n415 1307 NaN 3 \n416 1308 NaN 3 \n417 1309 NaN 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n.. ... ... ... ... \n413 Spector, Mr. Woolf male NaN 0 \n414 Oliva y Ocana, Dona. Fermina female 39.0 0 \n415 Saether, Mr. Simon Sivertsen male 38.5 0 \n416 Ware, Mr. Frederick male NaN 0 \n417 Peter, Master. Michael J male NaN 1 \n\n Parch Ticket Fare Cabin Embarked Title \\\n0 0 A/5 21171 7.2500 NaN S Mr \n1 0 PC 17599 71.2833 C85 C Mrs \n2 0 STON/O2. 3101282 7.9250 NaN S Miss \n3 0 113803 53.1000 C123 S Mrs \n4 0 373450 8.0500 NaN S Mr \n.. ... ... ... ... ... ... \n413 0 A.5. 3236 8.0500 NaN S Mr \n414 0 PC 17758 108.9000 C105 C Rare Title \n415 0 SOTON/O.Q. 3101262 7.2500 NaN S Mr \n416 0 359309 8.0500 NaN S Mr \n417 1 2668 22.3583 NaN C Master \n\n Surname Fsize Family FsizeD Deck \n0 Braund 2 Braund_2 small N \n1 Cumings 2 Cumings_2 small C \n2 Heikkinen 1 Heikkinen_1 singleton N \n3 Futrelle 2 Futrelle_2 small C \n4 Allen 1 Allen_1 singleton N \n.. ... ... ... ... ... \n413 Spector 1 Spector_1 singleton N \n414 Oliva y Ocana 1 Oliva y Ocana_1 singleton C \n415 Saether 1 Saether_1 singleton N \n416 Ware 1 Ware_1 singleton N \n417 Peter 3 Peter_3 small N \n\n[1309 rows x 18 columns]","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
Surname
\n
Fsize
\n
Family
\n
FsizeD
\n
Deck
\n
\n \n \n
\n
0
\n
1
\n
0.0
\n
3
\n
Braund, Mr. Owen Harris
\n
male
\n
22.0
\n
1
\n
0
\n
A/5 21171
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
Braund
\n
2
\n
Braund_2
\n
small
\n
N
\n
\n
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.0
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
Cumings
\n
2
\n
Cumings_2
\n
small
\n
C
\n
\n
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.0
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
Heikkinen
\n
1
\n
Heikkinen_1
\n
singleton
\n
N
\n
\n
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.0
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
Futrelle
\n
2
\n
Futrelle_2
\n
small
\n
C
\n
\n
\n
4
\n
5
\n
0.0
\n
3
\n
Allen, Mr. William Henry
\n
male
\n
35.0
\n
0
\n
0
\n
373450
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Allen
\n
1
\n
Allen_1
\n
singleton
\n
N
\n
\n
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
\n
\n
413
\n
1305
\n
NaN
\n
3
\n
Spector, Mr. Woolf
\n
male
\n
NaN
\n
0
\n
0
\n
A.5. 3236
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Spector
\n
1
\n
Spector_1
\n
singleton
\n
N
\n
\n
\n
414
\n
1306
\n
NaN
\n
1
\n
Oliva y Ocana, Dona. Fermina
\n
female
\n
39.0
\n
0
\n
0
\n
PC 17758
\n
108.9000
\n
C105
\n
C
\n
Rare Title
\n
Oliva y Ocana
\n
1
\n
Oliva y Ocana_1
\n
singleton
\n
C
\n
\n
\n
415
\n
1307
\n
NaN
\n
3
\n
Saether, Mr. Simon Sivertsen
\n
male
\n
38.5
\n
0
\n
0
\n
SOTON/O.Q. 3101262
\n
7.2500
\n
NaN
\n
S
\n
Mr
\n
Saether
\n
1
\n
Saether_1
\n
singleton
\n
N
\n
\n
\n
416
\n
1308
\n
NaN
\n
3
\n
Ware, Mr. Frederick
\n
male
\n
NaN
\n
0
\n
0
\n
359309
\n
8.0500
\n
NaN
\n
S
\n
Mr
\n
Ware
\n
1
\n
Ware_1
\n
singleton
\n
N
\n
\n
\n
417
\n
1309
\n
NaN
\n
3
\n
Peter, Master. Michael J
\n
male
\n
NaN
\n
1
\n
1
\n
2668
\n
22.3583
\n
NaN
\n
C
\n
Master
\n
Peter
\n
3
\n
Peter_3
\n
small
\n
N
\n
\n \n
\n
1309 rows × 18 columns
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":120}],"execution_count":120},{"cell_type":"markdown","source":"Inferiremos sus valores para el embarque en base a los datos actuales que imaginamos pueden ser relevantes: clase de pasajero y tarifa. Vemos que pagaron $80 y $NA respectivamente y sus clases son 1 y NA. Entonces, ¿de dónde se embarcaron?","metadata":{"id":"qf2nOUy8gOZn","cell_id":"69ba022c46c14ee4a0b8475fe32058f2","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"plt.figure(figsize=(10,6))\nsns.boxplot(x='Embarked',y= 'Fare',hue='Pclass',data= embark_fare)\nplt.axhline(y = 80, color = 'r', linestyle = '--')","metadata":{"id":"lKSEhlmsgOBa","colab":{"height":405,"base_uri":"https://localhost:8080/"},"cell_id":"53ddc5680f2d41b780f5248cd5e7fe9c","outputId":"aae78336-255c-4c40-8839-1c046ce389d6","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":687,"user_tz":180,"timestamp":1650298328791},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":""},"metadata":{},"execution_count":121},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":121},{"cell_type":"markdown","source":"¡Voila! La tarifa mediana para un pasajero de primera clase que sale de Charbourg (\"C\") coincide muy bien con los $80 que pagan nuestros pasajeros con problemas de embarque. Creo que podemos reemplazar con seguridad los valores NA con 'C'.","metadata":{"id":"RZUAO1tyhOlA","cell_id":"f563514018c747638596d539c7a09721","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"# Dado que su tarifa era de $ 80 por primera clase, lo más probable es que se embarcaran desde 'C' \nfull.loc[full.PassengerId == 62, 'Embarked']= 'C'\nfull.loc[full.PassengerId == 380, 'Embarked']= 'C'\nfull.Embarked.value_counts() # Verificamos ","metadata":{"id":"-k1NJVPHhTeU","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"31f13b000f3048c99590dffebd7a9e4a","outputId":"b64d56b2-a7e1-4d60-9842-acb5a5b1316a","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":3,"user_tz":180,"timestamp":1650298330723},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"S 913\nC 272\nQ 123\nName: Embarked, dtype: int64"},"metadata":{},"execution_count":122}],"execution_count":122},{"cell_type":"markdown","source":"Estamos cerca de arreglar el puñado de valores de NA aquí y allá. El pasajero de la fila 1043 tiene un valor de tarifa NA.","metadata":{"id":"1low24kUiAjg","cell_id":"d38458ec379748ebb38529837b161fd6","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full.iloc[1043,:]","metadata":{"id":"7LkA-raqiCyf","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"a80e67bb669643259bb147f8f546a910","outputId":"1d018e3d-283d-493f-f9bd-d82f6416f0c7","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":415,"user_tz":180,"timestamp":1650298333219},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId 1044\nSurvived NaN\nPclass 3\nName Storey, Mr. Thomas\nSex male\nAge 60.5\nSibSp 0\nParch 0\nTicket 3701\nFare NaN\nCabin NaN\nEmbarked S\nTitle Mr\nSurname Storey\nFsize 1\nFamily Storey_1\nFsizeD singleton\nDeck N\nName: 152, dtype: object"},"metadata":{},"execution_count":123}],"execution_count":123},{"cell_type":"markdown","source":"Este es un pasajero de tercera clase que partió de Southampton ('S'). Visualicemos las tarifas entre todos los demás compartiendo su clase y embarque (n = 494).","metadata":{"id":"-RjLPzuJiY4A","cell_id":"3b7e19f9e8c24225bf131c2eb8a06c1a","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full[(full.Pclass ==3)& (full.Embarked == 'S')]['Fare'].median()","metadata":{"id":"pVJsvks8kFLc","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"170061f5d64241b2ae36341ff8fa84dd","outputId":"2d28856f-2b42-4927-d800-327f09ede9eb","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":406,"user_tz":180,"timestamp":1650298337639},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"8.05"},"metadata":{},"execution_count":124}],"execution_count":124},{"cell_type":"code","source":"plt.figure(figsize=(10,6))\nsns.distplot(full[(full.Pclass ==3)& (full.Embarked == 'S')]['Fare'],kde=True,hist=False)\nplt.axvline(x = full[(full.Pclass ==3)& (full.Embarked == 'S')]['Fare'].median(), color = 'r', linestyle = '--')","metadata":{"id":"XEGuZxBliqxb","colab":{"height":460,"base_uri":"https://localhost:8080/"},"cell_id":"f1774cb780a441ba8b307d5ad5c4a239","outputId":"d3120b34-f0c8-4120-a55b-6b92e0d7bbf3","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":375,"user_tz":180,"timestamp":1650298339409},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stderr","text":"/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n warnings.warn(msg, FutureWarning)\n"},{"output_type":"execute_result","data":{"text/plain":""},"metadata":{},"execution_count":125},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":125},{"cell_type":"markdown","source":"A partir de esta visualización, parece bastante razonable reemplazar el valor de la tarifa NA con una mediana para su clase y embarque, que es de $8,05.","metadata":{"id":"YtD68Hnrj4GE","cell_id":"bbbd18de53dc4a6999dd3a60c57e5a92","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":" full.iloc[1043,:]","metadata":{"id":"qtHXkLdNl43h","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"277a2c3cefe04742a9655f1ba4276f65","outputId":"adb10ed6-16d6-4899-c56c-2a472ae98d0a","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":3,"user_tz":180,"timestamp":1650298342254},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId 1044\nSurvived NaN\nPclass 3\nName Storey, Mr. Thomas\nSex male\nAge 60.5\nSibSp 0\nParch 0\nTicket 3701\nFare NaN\nCabin NaN\nEmbarked S\nTitle Mr\nSurname Storey\nFsize 1\nFamily Storey_1\nFsizeD singleton\nDeck N\nName: 152, dtype: object"},"metadata":{},"execution_count":126}],"execution_count":126},{"cell_type":"code","source":"full.loc[full.PassengerId == 1044, 'Fare']= full[(full.Pclass ==3)& (full.Embarked == 'S')]['Fare'].median()\nfull.iloc[1043,:]","metadata":{"id":"kGvHafLtmF0g","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"a738a285503749dba99d8a1726d5f493","outputId":"b8d00522-045b-4214-d438-2c11fcf22495","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":4,"user_tz":180,"timestamp":1650298343763},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId 1044\nSurvived NaN\nPclass 3\nName Storey, Mr. Thomas\nSex male\nAge 60.5\nSibSp 0\nParch 0\nTicket 3701\nFare 8.05\nCabin NaN\nEmbarked S\nTitle Mr\nSurname Storey\nFsize 1\nFamily Storey_1\nFsizeD singleton\nDeck N\nName: 152, dtype: object"},"metadata":{},"execution_count":127}],"execution_count":127},{"cell_type":"markdown","source":"## Imputacion predictiva","metadata":{"id":"_kqnD5eImbZS","cell_id":"ef385687d72441a9927fbc9021c1b8c6","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"Finalmente, como señalamos anteriormente, faltan bastantes valores de Edad en nuestros datos. Vamos a ser un poco más sofisticados en la imputación de valores de edad faltantes. ¿Por qué? Porque podemos. Crearemos un modelo que prediga las edades en función de otras variables.","metadata":{"id":"rQmgybR8tg_h","cell_id":"d090fe8bc4c64755b599503e9ecf2aa5","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"sum(full.Age.isna())","metadata":{"id":"i5WrV9pGmdMa","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"77248a2b12154cccb600e367501344fa","outputId":"68e4902e-8c0a-4108-99ee-80a3caa77913","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":346,"user_tz":180,"timestamp":1650298347097},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"263"},"metadata":{},"execution_count":128}],"execution_count":128},{"cell_type":"code","source":"!pip install fancyimpute","metadata":{"id":"Q6tpIlCHMysJ","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"adc0502760d749f9bd719b3ab0b2c685","outputId":"a9bb7984-60fa-4dfc-8b55-9249364fe5f8","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":3204,"user_tz":180,"timestamp":1650298351469},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stdout","text":"Requirement already satisfied: fancyimpute in /usr/local/lib/python3.7/dist-packages (0.7.0)\nRequirement already satisfied: nose in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (1.3.7)\nRequirement already satisfied: knnimpute>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (0.1.0)\nRequirement already satisfied: scikit-learn>=0.24.2 in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (1.0.2)\nRequirement already satisfied: pytest in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (3.6.4)\nRequirement already satisfied: cvxopt in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (1.2.7)\nRequirement already satisfied: cvxpy in /usr/local/lib/python3.7/dist-packages (from fancyimpute) (1.0.31)\nRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from knnimpute>=0.1.0->fancyimpute) (1.15.0)\nRequirement already satisfied: numpy>=1.10 in /usr/local/lib/python3.7/dist-packages (from knnimpute>=0.1.0->fancyimpute) (1.21.5)\nRequirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.24.2->fancyimpute) (1.4.1)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.24.2->fancyimpute) (3.1.0)\nRequirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.24.2->fancyimpute) (1.1.0)\nRequirement already satisfied: scs>=1.1.3 in /usr/local/lib/python3.7/dist-packages (from cvxpy->fancyimpute) (3.2.0)\nRequirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from cvxpy->fancyimpute) (0.70.12.2)\nRequirement already satisfied: osqp>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from cvxpy->fancyimpute) (0.6.2.post0)\nRequirement already satisfied: ecos>=2 in /usr/local/lib/python3.7/dist-packages (from cvxpy->fancyimpute) (2.0.10)\nRequirement already satisfied: qdldl in /usr/local/lib/python3.7/dist-packages (from osqp>=0.4.1->cvxpy->fancyimpute) (0.1.5.post2)\nRequirement already satisfied: dill>=0.3.4 in /usr/local/lib/python3.7/dist-packages (from multiprocess->cvxpy->fancyimpute) (0.3.4)\nRequirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (21.4.0)\nRequirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (1.11.0)\nRequirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (8.12.0)\nRequirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (1.4.0)\nRequirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (0.7.1)\nRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from pytest->fancyimpute) (57.4.0)\n"}],"execution_count":129},{"cell_type":"code","source":"full.dtypes","metadata":{"id":"jIJD8cSWcoCl","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"8cb4243904704996af281e3e64398942","outputId":"f6433683-2887-4f4a-dad9-0ee49636f25a","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":393,"user_tz":180,"timestamp":1650298360358},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId int64\nSurvived float64\nPclass int64\nName object\nSex object\nAge float64\nSibSp int64\nParch int64\nTicket object\nFare float64\nCabin object\nEmbarked object\nTitle object\nSurname object\nFsize int64\nFamily object\nFsizeD object\nDeck object\ndtype: object"},"metadata":{},"execution_count":130}],"execution_count":130},{"cell_type":"code","source":"from fancyimpute import IterativeImputer\nmice_impute = IterativeImputer()\ntraindatafill = mice_impute.fit_transform(full[['Age','SibSp','Fare','Survived']])\ntraindatafill= pd.DataFrame(traindatafill)","metadata":{"id":"yazEuLQnM1aV","cell_id":"3a24a71026e54ed2b2a0fcc1e15c07cf","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":318,"user_tz":180,"timestamp":1650298376424},"deepnote_cell_type":"code"},"outputs":[],"execution_count":131},{"cell_type":"code","source":"traindatafill[3].isnull().sum()","metadata":{"id":"SvAQ0_Zlct__","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"f7e09d1ccd804606a48a101eb98fad6c","outputId":"60cb93b2-4534-472a-988b-e22bab4030ae","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":282,"user_tz":180,"timestamp":1650298397863},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"0"},"metadata":{},"execution_count":134}],"execution_count":134},{"cell_type":"code","source":"plt.figure(figsize=(10,6)) \nplt.subplot(121)\nsns.histplot(full.Age)\nplt.ylim([0,170])\nplt.title('Original Age')\nplt.subplot(122)\nsns.histplot(traindatafill[0])\nplt.ylim([0,170])\nplt.title('Modificacion MICE Age')","metadata":{"id":"2V9Qo9DUQYtK","colab":{"height":421,"base_uri":"https://localhost:8080/"},"cell_id":"e83530c57f2b4bc78697dc350fb0ea2a","outputId":"f9d37d38-9e0a-4559-a907-cd512e70ad68","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":1353,"user_tz":180,"timestamp":1650298402150},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"Text(0.5, 1.0, 'Modificacion MICE Age')"},"metadata":{},"execution_count":135},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":135},{"cell_type":"code","source":"# Reasignacion\nfull['Age']= traindatafill[0] \nfull['Survived']= traindatafill[3] \nprint(full.Age.isnull().sum())\nprint(full.Survived.isnull().sum())","metadata":{"id":"vQZ5syj6RErz","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"4a5977d04e9345d0b3371df6811063f0","outputId":"b367d531-0797-4009-ea47-d570fe0c7c30","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":1809,"user_tz":180,"timestamp":1650298439808},"deepnote_cell_type":"code"},"outputs":[{"output_type":"stream","name":"stdout","text":"0\n0\n"}],"execution_count":136},{"cell_type":"markdown","source":"## Feature Engineering Parte II","metadata":{"id":"Sdd3OYDHTnuj","cell_id":"d0ab58095aa048dd8fc6a59d93d6440f","deepnote_cell_type":"markdown"}},{"cell_type":"markdown","source":"Ahora que sabemos la edad de todos, podemos crear un par de nuevas variables dependientes de la edad: 'Child' y 'Mother'. Un niño será simplemente alguien menor de 18 años y una madre es un pasajero que es 1) mujer, 2) tiene más de 18 años, 3) tiene más de 0 hijos (¡no es broma!), y 4) no tiene el título 'Miss'.","metadata":{"id":"kapmgAx4TqGW","cell_id":"0647699bbace4d71bccaaa06a6404aa7","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"full['Survived']=full['Survived'].astype('str')\nfull.dtypes","metadata":{"id":"xgtskUuEVNEt","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"cd0eb2f869c14c10bb400c9e45618fff","outputId":"0a43f6b0-985f-46f1-a216-4db038ccdfa6","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":1369,"user_tz":180,"timestamp":1650298486518},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"PassengerId int64\nSurvived object\nPclass int64\nName object\nSex object\nAge float64\nSibSp int64\nParch int64\nTicket object\nFare float64\nCabin object\nEmbarked object\nTitle object\nSurname object\nFsize int64\nFamily object\nFsizeD object\nDeck object\ndtype: object"},"metadata":{},"execution_count":138}],"execution_count":138},{"cell_type":"code","source":"df_male= full.loc[full.Sex == 'male']\ndf_male.reset_index()\ndf_female= full.loc[full.Sex == 'female']\ndf_female.reset_index()","metadata":{"id":"fLhJ9ZwYVeSy","colab":{"height":606,"base_uri":"https://localhost:8080/"},"cell_id":"c5d8fd93e258490bab49efc77677b172","outputId":"fc4c7d5f-b65d-4aeb-b1fe-5ba6b2478ff6","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":888,"user_tz":180,"timestamp":1650298493643},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":" index PassengerId Survived Pclass \\\n0 1 2 1.0 1 \n1 2 3 1.0 3 \n2 3 4 1.0 1 \n3 8 9 1.0 3 \n4 9 10 1.0 2 \n.. ... ... ... ... \n461 409 1301 0.0 3 \n462 410 1302 0.0 3 \n463 411 1303 0.0 1 \n464 412 1304 1.0 3 \n465 414 1306 1.0 1 \n\n Name Sex Age \\\n0 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 \n1 Heikkinen, Miss. Laina female 26.000000 \n2 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 \n3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.000000 \n4 Nasser, Mrs. Nicholas (Adele Achem) female 14.000000 \n.. ... ... ... \n461 Peacock, Miss. Treasteall female 20.235103 \n462 Naughton, Miss. Hannah female 32.368066 \n463 Minahan, Mrs. William Edward (Lillian E Thorpe) female 32.295987 \n464 Henriksson, Miss. Jenny Lovisa female 33.000000 \n465 Oliva y Ocana, Dona. Fermina female 44.000000 \n\n SibSp Parch Ticket Fare Cabin Embarked Title \\\n0 1 0 PC 17599 71.2833 C85 C Mrs \n1 0 0 STON/O2. 3101282 7.9250 NaN S Miss \n2 1 0 113803 53.1000 C123 S Mrs \n3 0 2 347742 11.1333 NaN S Mrs \n4 1 0 237736 30.0708 NaN C Mrs \n.. ... ... ... ... ... ... ... \n461 1 1 SOTON/O.Q. 3101315 13.7750 NaN S Miss \n462 0 0 365237 7.7500 NaN Q Miss \n463 1 0 19928 90.0000 C78 Q Mrs \n464 0 0 347086 7.7750 NaN S Miss \n465 0 0 PC 17758 108.9000 C105 C Rare Title \n\n Surname Fsize Family FsizeD Deck \n0 Cumings 2 Cumings_2 small C \n1 Heikkinen 1 Heikkinen_1 singleton N \n2 Futrelle 2 Futrelle_2 small C \n3 Johnson 3 Johnson_3 small N \n4 Nasser 2 Nasser_2 small N \n.. ... ... ... ... ... \n461 Peacock 3 Peacock_3 small N \n462 Naughton 1 Naughton_1 singleton N \n463 Minahan 2 Minahan_2 small C \n464 Henriksson 1 Henriksson_1 singleton N \n465 Oliva y Ocana 1 Oliva y Ocana_1 singleton C \n\n[466 rows x 19 columns]","text/html":"\n
\n
\n
\n\n
\n \n
\n
\n
index
\n
PassengerId
\n
Survived
\n
Pclass
\n
Name
\n
Sex
\n
Age
\n
SibSp
\n
Parch
\n
Ticket
\n
Fare
\n
Cabin
\n
Embarked
\n
Title
\n
Surname
\n
Fsize
\n
Family
\n
FsizeD
\n
Deck
\n
\n \n \n
\n
0
\n
1
\n
2
\n
1.0
\n
1
\n
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n
female
\n
38.000000
\n
1
\n
0
\n
PC 17599
\n
71.2833
\n
C85
\n
C
\n
Mrs
\n
Cumings
\n
2
\n
Cumings_2
\n
small
\n
C
\n
\n
\n
1
\n
2
\n
3
\n
1.0
\n
3
\n
Heikkinen, Miss. Laina
\n
female
\n
26.000000
\n
0
\n
0
\n
STON/O2. 3101282
\n
7.9250
\n
NaN
\n
S
\n
Miss
\n
Heikkinen
\n
1
\n
Heikkinen_1
\n
singleton
\n
N
\n
\n
\n
2
\n
3
\n
4
\n
1.0
\n
1
\n
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n
female
\n
35.000000
\n
1
\n
0
\n
113803
\n
53.1000
\n
C123
\n
S
\n
Mrs
\n
Futrelle
\n
2
\n
Futrelle_2
\n
small
\n
C
\n
\n
\n
3
\n
8
\n
9
\n
1.0
\n
3
\n
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
\n
female
\n
27.000000
\n
0
\n
2
\n
347742
\n
11.1333
\n
NaN
\n
S
\n
Mrs
\n
Johnson
\n
3
\n
Johnson_3
\n
small
\n
N
\n
\n
\n
4
\n
9
\n
10
\n
1.0
\n
2
\n
Nasser, Mrs. Nicholas (Adele Achem)
\n
female
\n
14.000000
\n
1
\n
0
\n
237736
\n
30.0708
\n
NaN
\n
C
\n
Mrs
\n
Nasser
\n
2
\n
Nasser_2
\n
small
\n
N
\n
\n
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
...
\n
\n
\n
461
\n
409
\n
1301
\n
0.0
\n
3
\n
Peacock, Miss. Treasteall
\n
female
\n
20.235103
\n
1
\n
1
\n
SOTON/O.Q. 3101315
\n
13.7750
\n
NaN
\n
S
\n
Miss
\n
Peacock
\n
3
\n
Peacock_3
\n
small
\n
N
\n
\n
\n
462
\n
410
\n
1302
\n
0.0
\n
3
\n
Naughton, Miss. Hannah
\n
female
\n
32.368066
\n
0
\n
0
\n
365237
\n
7.7500
\n
NaN
\n
Q
\n
Miss
\n
Naughton
\n
1
\n
Naughton_1
\n
singleton
\n
N
\n
\n
\n
463
\n
411
\n
1303
\n
0.0
\n
1
\n
Minahan, Mrs. William Edward (Lillian E Thorpe)
\n
female
\n
32.295987
\n
1
\n
0
\n
19928
\n
90.0000
\n
C78
\n
Q
\n
Mrs
\n
Minahan
\n
2
\n
Minahan_2
\n
small
\n
C
\n
\n
\n
464
\n
412
\n
1304
\n
1.0
\n
3
\n
Henriksson, Miss. Jenny Lovisa
\n
female
\n
33.000000
\n
0
\n
0
\n
347086
\n
7.7750
\n
NaN
\n
S
\n
Miss
\n
Henriksson
\n
1
\n
Henriksson_1
\n
singleton
\n
N
\n
\n
\n
465
\n
414
\n
1306
\n
1.0
\n
1
\n
Oliva y Ocana, Dona. Fermina
\n
female
\n
44.000000
\n
0
\n
0
\n
PC 17758
\n
108.9000
\n
C105
\n
C
\n
Rare Title
\n
Oliva y Ocana
\n
1
\n
Oliva y Ocana_1
\n
singleton
\n
C
\n
\n \n
\n
466 rows × 19 columns
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":139}],"execution_count":139},{"cell_type":"code","source":"# Ahora miremos la relacion edad vs survival\nplt.figure(figsize=(12,6))\nplt.subplot(121)\nsns.histplot(df_male.loc[df_male['Survived']=='1.0']['Age'], color='orange',label='1')\nsns.histplot(df_male.loc[df_male['Survived']=='0.0']['Age'],color='red',label='0')\nplt.legend()\nplt.title('Male')\nplt.ylim([0,100])\nplt.subplot(122)\nsns.histplot(df_female.loc[df_female['Survived']=='1.0']['Age'], color='orange',label='1')\nsns.histplot(df_female.loc[df_female['Survived']=='0.0']['Age'],color='red',label='0')\nplt.legend()\nplt.title('Female')\nplt.ylim([0,100])","metadata":{"id":"JUxNwuKIRN5l","colab":{"height":421,"base_uri":"https://localhost:8080/"},"cell_id":"ecb9b395e0f84806944e0bde5c167cfd","outputId":"ddaedcc9-469c-40d0-d2f7-5b2beeba21de","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":22,"user_tz":180,"timestamp":1650298503963},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"(0.0, 100.0)"},"metadata":{},"execution_count":140},{"output_type":"display_data","data":{"text/plain":"
","image/png":"\n"},"metadata":{"needs_background":"light"}}],"execution_count":140},{"cell_type":"code","source":"# Crear la columna child\nfull['Child']= np.where(full['Age']<18, 'Child','Adult')\n# mostrar conteos\npd.crosstab(full.Child, full.Survived)","metadata":{"id":"8YT3cVnGXWyZ","colab":{"height":143,"base_uri":"https://localhost:8080/"},"cell_id":"2ab2c90705e940c2ab90af473eb22e49","outputId":"9c34ca15-3310-40b2-bb45-794987ca5ddf","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":8,"user_tz":180,"timestamp":1650298524695},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"Survived 0.0 1.0\nChild \nAdult 714 415\nChild 90 90","text/html":"\n
\n
\n
\n\n
\n \n
\n
Survived
\n
0.0
\n
1.0
\n
\n
\n
Child
\n
\n
\n
\n \n \n
\n
Adult
\n
714
\n
415
\n
\n
\n
Child
\n
90
\n
90
\n
\n \n
\n
\n \n \n \n\n \n
\n
\n "},"metadata":{},"execution_count":141}],"execution_count":141},{"cell_type":"markdown","source":"Parece que ser un niño no duele, ¡pero tampoco necesariamente te salvará! Terminaremos nuestra ingeniería de características creando la variable Madre. Tal vez podamos esperar que las madres tengan más probabilidades de haber sobrevivido en el Titanic.","metadata":{"id":"Z968WIvodc55","cell_id":"2f4c0643e50e45b69eda5057fc2734a3","deepnote_cell_type":"markdown"}},{"cell_type":"code","source":"# Creando la variable Mother\nfull['Mother'] = 'Not Mother'\nfull.loc[(full.Sex == 'female') & (full.Parch >0) & (full.Age >18) & (full.Title != 'Miss'), 'Mother']= 'Mother'\nfull.Mother.value_counts()","metadata":{"id":"m_NECJKddf0Y","colab":{"base_uri":"https://localhost:8080/"},"cell_id":"386cfdc999f24aad83699c6c8f7a6191","outputId":"279cb2cb-0deb-4c3b-9ae5-0c1bf98c9a37","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":287,"user_tz":180,"timestamp":1650298742385},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"Not Mother 1231\nMother 78\nName: Mother, dtype: int64"},"metadata":{},"execution_count":142}],"execution_count":142},{"cell_type":"code","source":"pd.crosstab(full.Mother, full.Survived)","metadata":{"id":"j9K679N4eHOE","colab":{"height":143,"base_uri":"https://localhost:8080/"},"cell_id":"9761faf048bf43dab9e657a037779337","outputId":"a22fbc08-270f-4b30-b83f-d8bd65ac1e2d","executionInfo":{"user":{"userId":"09471607480253994520","displayName":"David Francisco Bustos Usta"},"status":"ok","elapsed":509,"user_tz":180,"timestamp":1650298775164},"deepnote_cell_type":"code"},"outputs":[{"output_type":"execute_result","data":{"text/plain":"Survived 0.0 1.0\nMother \nMother 33 45\nNot Mother 771 460","text/html":"\n