{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:08.986555226Z", "start_time": "2023-12-05T11:11:56.436113158Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%useLatestDescriptors\n", "%use dataframe, lets-plot" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:15.162136471Z", "start_time": "2023-12-05T11:12:08.742280317Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhomedest
11Allen, Miss. Elisabeth Waltonnull29.000000nullnull24160211.337500B5null2nullSt Louis, MO
11Allison, Master. Hudson Trevormale0.91670012113781151.550000C22 C26AA11nullMontreal, PQ / Chesterville, ON
10Allison, Miss. Helen Lorainefemale2.00000012113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
10Allison, Mr. Hudson Joshua Creightonmale30.00000012113781151.550000C22 C26Snull135Montreal, PQ / Chesterville, ON
10Allison, Mrs. Hudson J C (Bessie Wald...female25.00000012113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":14,\"columns\":[\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"homedest\"],\"kotlin_dataframe\":[{\"pclass\":1,\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"sex\":null,\"age\":29.0,\"sibsp\":null,\"parch\":null,\"ticket\":\"24160\",\"fare\":211.3375,\"cabin\":\"B5\",\"embarked\":null,\"boat\":\"2\",\"body\":null,\"homedest\":\"St Louis, MO\"},{\"pclass\":1,\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"sex\":\"male\",\"age\":0.9167,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"AA\",\"boat\":\"11\",\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"sex\":\"female\",\"age\":2.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"sex\":\"male\",\"age\":30.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":135,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"sex\":\"female\",\"age\":25.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 2 } ], "source": [ "var df = DataFrame.readCSV(\n", " fileOrUrl = \"../../idea-examples/titanic/src/main/resources/titanic.csv\",\n", " delimiter = ';',\n", " parserOptions = ParserOptions(locale = java.util.Locale.FRENCH),\n", ")\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have a dataset which uses an alternative pattern for decimal numbers. This is a reason why the French locale will be used in the example.\n", "\n", "But before data conversion, we should to handle *null* values." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:15.987714818Z", "start_time": "2023-12-05T11:12:15.153641630Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
nametypecountuniquenullstopfreqmeanstdminmedianmax
pclassInt13093037092.2948820.837836133
survivedInt13092008090.3819710.486055001
nameString130913070Connolly, Miss. Kate2nullnullAbbing, Mr. AnthonyKink, Mr. Vincenzvan Melkebeke, Mr. Philemon
sexString?130931male843nullnullfemalemalemale
ageDouble?13099926324.0000004729.88113514.4135000.16670028.00000080.000000
sibspInt?13098108900.4992351.041965008
parchInt?130991010010.3853210.865826009
ticketString13099290CA. 234311nullnull110152347082WE/P 5735
fareDouble?130928218.0500006033.29547951.7586680.00000014.454200512.329200
cabinString?13091871014C23 C25 C276nullnullA10C62 C64T
embarkedString?130953S912nullnullAASS
boatString?1309288231339nullnull13D
bodyInt?130912211881351160.80991797.6969221155328
homedestString?1309370564New York, NY64nullnull?Havana, CubaLyndhurst, EnglandZurich, Switzerland
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":14,\"ncol\":12,\"columns\":[\"name\",\"type\",\"count\",\"unique\",\"nulls\",\"top\",\"freq\",\"mean\",\"std\",\"min\",\"median\",\"max\"],\"kotlin_dataframe\":[{\"name\":\"pclass\",\"type\":\"Int\",\"count\":1309,\"unique\":3,\"nulls\":0,\"top\":\"3\",\"freq\":709,\"mean\":2.294881588999236,\"std\":0.837836018970131,\"min\":\"1\",\"median\":\"3\",\"max\":\"3\"},{\"name\":\"survived\",\"type\":\"Int\",\"count\":1309,\"unique\":2,\"nulls\":0,\"top\":\"0\",\"freq\":809,\"mean\":0.3819709702062643,\"std\":0.48605517086648004,\"min\":\"0\",\"median\":\"0\",\"max\":\"1\"},{\"name\":\"name\",\"type\":\"String\",\"count\":1309,\"unique\":1307,\"nulls\":0,\"top\":\"Connolly, Miss. Kate\",\"freq\":2,\"mean\":null,\"std\":null,\"min\":\"Abbing, Mr. Anthony\",\"median\":\"Kink, Mr. Vincenz\",\"max\":\"van Melkebeke, Mr. Philemon\"},{\"name\":\"sex\",\"type\":\"String?\",\"count\":1309,\"unique\":3,\"nulls\":1,\"top\":\"male\",\"freq\":843,\"mean\":null,\"std\":null,\"min\":\"female\",\"median\":\"male\",\"max\":\"male\"},{\"name\":\"age\",\"type\":\"Double?\",\"count\":1309,\"unique\":99,\"nulls\":263,\"top\":\"24.0\",\"freq\":47,\"mean\":29.8811345124283,\"std\":14.413499699923594,\"min\":\"0.1667\",\"median\":\"28.0\",\"max\":\"80.0\"},{\"name\":\"sibsp\",\"type\":\"Int?\",\"count\":1309,\"unique\":8,\"nulls\":1,\"top\":\"0\",\"freq\":890,\"mean\":0.49923547400611623,\"std\":1.041965373922986,\"min\":\"0\",\"median\":\"0\",\"max\":\"8\"},{\"name\":\"parch\",\"type\":\"Int?\",\"count\":1309,\"unique\":9,\"nulls\":1,\"top\":\"0\",\"freq\":1001,\"mean\":0.3853211009174312,\"std\":0.8658257885990794,\"min\":\"0\",\"median\":\"0\",\"max\":\"9\"},{\"name\":\"ticket\",\"type\":\"String\",\"count\":1309,\"unique\":929,\"nulls\":0,\"top\":\"CA. 2343\",\"freq\":11,\"mean\":null,\"std\":null,\"min\":\"110152\",\"median\":\"347082\",\"max\":\"WE/P 5735\"},{\"name\":\"fare\",\"type\":\"Double?\",\"count\":1309,\"unique\":282,\"nulls\":1,\"top\":\"8.05\",\"freq\":60,\"mean\":33.29547928134572,\"std\":51.758668239174135,\"min\":\"0.0\",\"median\":\"14.4542\",\"max\":\"512.3292\"},{\"name\":\"cabin\",\"type\":\"String?\",\"count\":1309,\"unique\":187,\"nulls\":1014,\"top\":\"C23 C25 C27\",\"freq\":6,\"mean\":null,\"std\":null,\"min\":\"A10\",\"median\":\"C62 C64\",\"max\":\"T\"},{\"name\":\"embarked\",\"type\":\"String?\",\"count\":1309,\"unique\":5,\"nulls\":3,\"top\":\"S\",\"freq\":912,\"mean\":null,\"std\":null,\"min\":\"AA\",\"median\":\"S\",\"max\":\"S\"},{\"name\":\"boat\",\"type\":\"String?\",\"count\":1309,\"unique\":28,\"nulls\":823,\"top\":\"13\",\"freq\":39,\"mean\":null,\"std\":null,\"min\":\"1\",\"median\":\"3\",\"max\":\"D\"},{\"name\":\"body\",\"type\":\"Int?\",\"count\":1309,\"unique\":122,\"nulls\":1188,\"top\":\"135\",\"freq\":1,\"mean\":160.8099173553719,\"std\":97.6969219960031,\"min\":\"1\",\"median\":\"155\",\"max\":\"328\"},{\"name\":\"homedest\",\"type\":\"String?\",\"count\":1309,\"unique\":370,\"nulls\":564,\"top\":\"New York, NY\",\"freq\":64,\"mean\":null,\"std\":null,\"min\":\"?Havana, Cuba\",\"median\":\"Lyndhurst, England\",\"max\":\"Zurich, Switzerland\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 3 } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:16.332958837Z", "start_time": "2023-12-05T11:12:15.706033181Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhomedest
11Allen, Miss. Elisabeth Waltonnull29.000000nullnull24160211.337500B5null2nullSt Louis, MO
11Allison, Master. Hudson Trevormale0.91670012113781151.550000C22 C26AA11nullMontreal, PQ / Chesterville, ON
10Allison, Miss. Helen Lorainefemale2.00000012113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
10Allison, Mr. Hudson Joshua Creightonmale30.00000012113781151.550000C22 C26Snull135Montreal, PQ / Chesterville, ON
10Allison, Mrs. Hudson J C (Bessie Wald...female25.00000012113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
11Anderson, Mr. Harrymale48.000000001995226.550000E12S3nullNew York, NY
11Andrews, Miss. Kornelia Theodosiafemale63.000000101350277.958300D7S10nullHudson, NY
10Andrews, Mr. Thomas Jrmale39.000000001120500.000000A36SnullnullBelfast, NI
11Appleton, Mrs. Edward Dale (Charlotte...female53.000000201176951.479200C101SDnullBayside, Queens, NY
10Artagaveytia, Mr. Ramonmale71.00000000PC 1760949.504200nullCnull22Montevideo, Uruguay
10Astor, Col. John Jacobmale47.00000010PC 17757227.525000C62 C64Cnull124New York, NY
11Astor, Mrs. John Jacob (Madeleine Tal...female18.00000010PC 17757227.525000C62 C64C4nullNew York, NY
11Aubart, Mme. Leontine Paulinefemale24.00000000PC 1747769.300000B35C9nullParis, France
11Barber, Miss. Ellen "Nellie"female26.000000001987778.850000nullS6nullnull
11Barkworth, Mr. Algernon Henry Wilsonmale80.000000002704230.000000A23SBnullHessle, Yorks
10Baumann, Mr. John Dmalenull00PC 1731825.925000nullSnullnullNew York, NY
10Baxter, Mr. Quigg Edmondmale24.00000001PC 17558247.520800B58 B60CnullnullMontreal, PQ
11Baxter, Mrs. James (Helene DeLaudenie...female50.00000001PC 17558247.520800B58 B60C6nullMontreal, PQ
11Bazzani, Miss. Albinafemale32.000000001181376.291700D15C8nullnull
10Beattie, Mr. Thomsonmale36.000000001305075.241700C6CAnullWinnipeg, MN
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":1309,\"ncol\":14,\"columns\":[\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"homedest\"],\"kotlin_dataframe\":[{\"pclass\":1,\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"sex\":null,\"age\":29.0,\"sibsp\":null,\"parch\":null,\"ticket\":\"24160\",\"fare\":211.3375,\"cabin\":\"B5\",\"embarked\":null,\"boat\":\"2\",\"body\":null,\"homedest\":\"St Louis, MO\"},{\"pclass\":1,\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"sex\":\"male\",\"age\":0.9167,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"AA\",\"boat\":\"11\",\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"sex\":\"female\",\"age\":2.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"sex\":\"male\",\"age\":30.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":135,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"sex\":\"female\",\"age\":25.0,\"sibsp\":1,\"parch\":2,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":1,\"name\":\"Anderson, Mr. Harry\",\"sex\":\"male\",\"age\":48.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"19952\",\"fare\":26.55,\"cabin\":\"E12\",\"embarked\":\"S\",\"boat\":\"3\",\"body\":null,\"homedest\":\"New York, NY\"},{\"pclass\":1,\"survived\":1,\"name\":\"Andrews, Miss. Kornelia Theodosia\",\"sex\":\"female\",\"age\":63.0,\"sibsp\":1,\"parch\":0,\"ticket\":\"13502\",\"fare\":77.9583,\"cabin\":\"D7\",\"embarked\":\"S\",\"boat\":\"10\",\"body\":null,\"homedest\":\"Hudson, NY\"},{\"pclass\":1,\"survived\":0,\"name\":\"Andrews, Mr. Thomas Jr\",\"sex\":\"male\",\"age\":39.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"112050\",\"fare\":0.0,\"cabin\":\"A36\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Belfast, NI\"},{\"pclass\":1,\"survived\":1,\"name\":\"Appleton, Mrs. Edward Dale (Charlotte Lamson)\",\"sex\":\"female\",\"age\":53.0,\"sibsp\":2,\"parch\":0,\"ticket\":\"11769\",\"fare\":51.4792,\"cabin\":\"C101\",\"embarked\":\"S\",\"boat\":\"D\",\"body\":null,\"homedest\":\"Bayside, Queens, NY\"},{\"pclass\":1,\"survived\":0,\"name\":\"Artagaveytia, Mr. Ramon\",\"sex\":\"male\",\"age\":71.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"PC 17609\",\"fare\":49.5042,\"cabin\":null,\"embarked\":\"C\",\"boat\":null,\"body\":22,\"homedest\":\"Montevideo, Uruguay\"},{\"pclass\":1,\"survived\":0,\"name\":\"Astor, Col. John Jacob\",\"sex\":\"male\",\"age\":47.0,\"sibsp\":1,\"parch\":0,\"ticket\":\"PC 17757\",\"fare\":227.525,\"cabin\":\"C62 C64\",\"embarked\":\"C\",\"boat\":null,\"body\":124,\"homedest\":\"New York, NY\"},{\"pclass\":1,\"survived\":1,\"name\":\"Astor, Mrs. John Jacob (Madeleine Talmadge Force)\",\"sex\":\"female\",\"age\":18.0,\"sibsp\":1,\"parch\":0,\"ticket\":\"PC 17757\",\"fare\":227.525,\"cabin\":\"C62 C64\",\"embarked\":\"C\",\"boat\":\"4\",\"body\":null,\"homedest\":\"New York, NY\"},{\"pclass\":1,\"survived\":1,\"name\":\"Aubart, Mme. Leontine Pauline\",\"sex\":\"female\",\"age\":24.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"PC 17477\",\"fare\":69.3,\"cabin\":\"B35\",\"embarked\":\"C\",\"boat\":\"9\",\"body\":null,\"homedest\":\"Paris, France\"},{\"pclass\":1,\"survived\":1,\"name\":\"Barber, Miss. Ellen \\\"Nellie\\\"\",\"sex\":\"female\",\"age\":26.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"19877\",\"fare\":78.85,\"cabin\":null,\"embarked\":\"S\",\"boat\":\"6\",\"body\":null,\"homedest\":null},{\"pclass\":1,\"survived\":1,\"name\":\"Barkworth, Mr. Algernon Henry Wilson\",\"sex\":\"male\",\"age\":80.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"27042\",\"fare\":30.0,\"cabin\":\"A23\",\"embarked\":\"S\",\"boat\":\"B\",\"body\":null,\"homedest\":\"Hessle, Yorks\"},{\"pclass\":1,\"survived\":0,\"name\":\"Baumann, Mr. John D\",\"sex\":\"male\",\"age\":null,\"sibsp\":0,\"parch\":0,\"ticket\":\"PC 17318\",\"fare\":25.925,\"cabin\":null,\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"New York, NY\"},{\"pclass\":1,\"survived\":0,\"name\":\"Baxter, Mr. Quigg Edmond\",\"sex\":\"male\",\"age\":24.0,\"sibsp\":0,\"parch\":1,\"ticket\":\"PC 17558\",\"fare\":247.5208,\"cabin\":\"B58 B60\",\"embarked\":\"C\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ\"},{\"pclass\":1,\"survived\":1,\"name\":\"Baxter, Mrs. James (Helene DeLaudeniere Chaput)\",\"sex\":\"female\",\"age\":50.0,\"sibsp\":0,\"parch\":1,\"ticket\":\"PC 17558\",\"fare\":247.5208,\"cabin\":\"B58 B60\",\"embarked\":\"C\",\"boat\":\"6\",\"body\":null,\"homedest\":\"Montreal, PQ\"},{\"pclass\":1,\"survived\":1,\"name\":\"Bazzani, Miss. Albina\",\"sex\":\"female\",\"age\":32.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"11813\",\"fare\":76.2917,\"cabin\":\"D15\",\"embarked\":\"C\",\"boat\":\"8\",\"body\":null,\"homedest\":null},{\"pclass\":1,\"survived\":0,\"name\":\"Beattie, Mr. Thomson\",\"sex\":\"male\",\"age\":36.0,\"sibsp\":0,\"parch\":0,\"ticket\":\"13050\",\"fare\":75.2417,\"cabin\":\"C6\",\"embarked\":\"C\",\"boat\":\"A\",\"body\":null,\"homedest\":\"Winnipeg, MN\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 4 } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Imputing null values\n", "Let's convert all columns of our dataset to non-nullable and impute null values based on mean values." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:19.051617624Z", "start_time": "2023-12-05T11:12:15.952294903Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhomedest
11Allen, Miss. Elisabeth Waltonfemale29.0000000.0000000.00000024160211.337500B5S2nullSt Louis, MO
11Allison, Master. Hudson Trevormale0.9167001.0000002.000000113781151.550000C22 C26AA11nullMontreal, PQ / Chesterville, ON
10Allison, Miss. Helen Lorainefemale2.0000001.0000002.000000113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
10Allison, Mr. Hudson Joshua Creightonmale30.0000001.0000002.000000113781151.550000C22 C26Snull135Montreal, PQ / Chesterville, ON
10Allison, Mrs. Hudson J C (Bessie Wald...female25.0000001.0000002.000000113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":14,\"columns\":[\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"homedest\"],\"kotlin_dataframe\":[{\"pclass\":1,\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"sex\":\"female\",\"age\":29.0,\"sibsp\":0.0,\"parch\":0.0,\"ticket\":\"24160\",\"fare\":211.3375,\"cabin\":\"B5\",\"embarked\":\"S\",\"boat\":\"2\",\"body\":null,\"homedest\":\"St Louis, MO\"},{\"pclass\":1,\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"sex\":\"male\",\"age\":0.9167,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"AA\",\"boat\":\"11\",\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"sex\":\"female\",\"age\":2.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"sex\":\"male\",\"age\":30.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":135,\"homedest\":\"Montreal, PQ / Chesterville, ON\"},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"sex\":\"female\",\"age\":25.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 5 } ], "source": [ "val df1 = df\n", " // imputing\n", " .fillNulls { sibsp and parch and age and fare }.perCol { mean() }\n", " .fillNulls { sex }.with { \"female\" }\n", " .fillNulls { embarked }.with { \"S\" }\n", " .convert { sibsp and parch and age and fare }.toDouble()\n", "\n", "df1.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:19.263490381Z", "start_time": "2023-12-05T11:12:19.036527845Z" } }, "outputs": [ { "data": { "text/plain": [ "pclass: Int\n", "survived: Int\n", "name: String\n", "sex: String\n", "age: Double\n", "sibsp: Double\n", "parch: Double\n", "ticket: String\n", "fare: Double\n", "cabin: String?\n", "embarked: String\n", "boat: String?\n", "body: Int?\n", "homedest: String?" ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 6 } ], "source": [ "df1.schema()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:19.754150514Z", "start_time": "2023-12-05T11:12:19.155117622Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
columnpclasssurvivedagesibspparchfare
pclass1.000000-0.312469-0.3663700.0608320.018322-0.558477
survived-0.3124691.000000-0.050199-0.0278250.0826600.244208
age-0.366370-0.0501991.000000-0.190747-0.1308720.171521
sibsp0.060832-0.027825-0.1907471.0000000.3735870.160224
parch0.0183220.082660-0.1308720.3735871.0000000.221522
fare-0.5584770.2442080.1715210.1602240.2215221.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":6,\"ncol\":7,\"columns\":[\"column\",\"pclass\",\"survived\",\"age\",\"sibsp\",\"parch\",\"fare\"],\"kotlin_dataframe\":[{\"column\":\"pclass\",\"pclass\":1.0,\"survived\":-0.31246936264968,\"age\":-0.36637035869802936,\"sibsp\":0.06083200757490747,\"parch\":0.018322202009786036,\"fare\":-0.5584773475043957},{\"column\":\"survived\",\"pclass\":-0.31246936264968,\"survived\":1.0,\"age\":-0.050198983636982906,\"sibsp\":-0.02782511923058273,\"parch\":0.0826595703861011,\"fare\":0.24420775279437662},{\"column\":\"age\",\"pclass\":-0.36637035869802936,\"survived\":-0.050198983636982906,\"age\":1.0,\"sibsp\":-0.19074715633383899,\"parch\":-0.1308719630307398,\"fare\":0.17152056539956614},{\"column\":\"sibsp\",\"pclass\":0.06083200757490747,\"survived\":-0.02782511923058273,\"age\":-0.19074715633383899,\"sibsp\":1.0,\"parch\":0.3735871906264913,\"fare\":0.16022419622116035},{\"column\":\"parch\",\"pclass\":0.018322202009786036,\"survived\":0.0826595703861011,\"age\":-0.1308719630307398,\"sibsp\":0.3735871906264913,\"parch\":1.0,\"fare\":0.2215218879995723},{\"column\":\"fare\",\"pclass\":-0.5584773475043957,\"survived\":0.24420775279437662,\"age\":0.17152056539956614,\"sibsp\":0.16022419622116035,\"parch\":0.2215218879995723,\"fare\":1.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 7 } ], "source": [ "df1.corr()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:20.355589884Z", "start_time": "2023-12-05T11:12:19.319432427Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
columnsurvived
pclass-0.312469
age-0.050199
sibsp-0.027825
parch0.082660
fare0.244208
survived1.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":6,\"ncol\":2,\"columns\":[\"column\",\"survived\"],\"kotlin_dataframe\":[{\"column\":\"pclass\",\"survived\":-0.31246936264968},{\"column\":\"age\",\"survived\":-0.050198983636982906},{\"column\":\"sibsp\",\"survived\":-0.02782511923058273},{\"column\":\"parch\",\"survived\":0.0826595703861011},{\"column\":\"fare\",\"survived\":0.24420775279437662},{\"column\":\"survived\",\"survived\":1.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 8 } ], "source": [ "val correlations = df1\n", " .corr { all() }.with { survived }\n", " .sortBy { survived }\n", "correlations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great, at this moment we have 5 numerical features available for numerical analysis: **pclass, age, sibsp, parch, fare**." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyze by pivoting features\n", "To confirm some of our observations and assumptions, we can quickly analyze our feature correlations by pivoting features against each other. We can only do so at this stage for features which do not have any empty values. It also makes sense doing so only for features which are categorical (Sex), ordinal (Pclass) or discrete (SibSp, Parch) type.\n", "\n", "- **Pclass**: We observe significant correlation (>0.5) among **Pclass**=1 and **Survived**.\n", "\n", "- **Sex**: We confirm the observation during problem definition that Sex=female had a very high survival rate at 74%.\n", "\n", "- **SibSp** and **Parch**: These features have zero correlation for the certain values. It may be best to derive a feature or a set of features from these individual features." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:20.500754354Z", "start_time": "2023-12-05T11:12:19.936122111Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvived
10.619195
20.429603
30.255289
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":3,\"ncol\":2,\"columns\":[\"pclass\",\"survived\"],\"kotlin_dataframe\":[{\"pclass\":1,\"survived\":0.6191950464396285},{\"pclass\":2,\"survived\":0.4296028880866426},{\"pclass\":3,\"survived\":0.2552891396332863}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 9 } ], "source": [ "df1.groupBy { pclass }.mean { survived }.sortBy { pclass }" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:20.861360454Z", "start_time": "2023-12-05T11:12:20.226131750Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
sexsurvived
male0.190985
female0.727468
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":2,\"ncol\":2,\"columns\":[\"sex\",\"survived\"],\"kotlin_dataframe\":[{\"sex\":\"male\",\"survived\":0.19098457888493475},{\"sex\":\"female\",\"survived\":0.7274678111587983}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 10 } ], "source": [ "df1.groupBy { sex }.mean { survived }.sortBy { survived }" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:21.575289484Z", "start_time": "2023-12-05T11:12:20.434891014Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
sibspsurvived
0.0000000.346801
1.0000000.510972
2.0000000.452381
3.0000000.300000
4.0000000.136364
5.0000000.000000
8.0000000.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":7,\"ncol\":2,\"columns\":[\"sibsp\",\"survived\"],\"kotlin_dataframe\":[{\"sibsp\":0.0,\"survived\":0.3468013468013468},{\"sibsp\":1.0,\"survived\":0.5109717868338558},{\"sibsp\":2.0,\"survived\":0.4523809523809524},{\"sibsp\":3.0,\"survived\":0.3},{\"sibsp\":4.0,\"survived\":0.13636363636363635},{\"sibsp\":5.0,\"survived\":0.0},{\"sibsp\":8.0,\"survived\":0.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 11 } ], "source": [ "df1.groupBy { sibsp }.mean { survived }.sortBy { sibsp }" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:21.823175489Z", "start_time": "2023-12-05T11:12:20.640771224Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
parchsurvived
0.0000000.335329
1.0000000.588235
2.0000000.504425
3.0000000.625000
4.0000000.166667
5.0000000.166667
6.0000000.000000
9.0000000.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":8,\"ncol\":2,\"columns\":[\"parch\",\"survived\"],\"kotlin_dataframe\":[{\"parch\":0.0,\"survived\":0.33532934131736525},{\"parch\":1.0,\"survived\":0.5882352941176471},{\"parch\":2.0,\"survived\":0.504424778761062},{\"parch\":3.0,\"survived\":0.625},{\"parch\":4.0,\"survived\":0.16666666666666666},{\"parch\":5.0,\"survived\":0.16666666666666666},{\"parch\":6.0,\"survived\":0.0},{\"parch\":9.0,\"survived\":0.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 12 } ], "source": [ "df1.groupBy { parch }.mean { survived }.sortBy { parch }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyze the importance of the Age feature\n", "\n", "It's interesting to discover both **age** distributions: among survived and not survived passengers." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:22.108173663Z", "start_time": "2023-12-05T11:12:20.881718639Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
agecount
0.1667001
0.3333001
0.4167001
0.6667001
0.7500003
0.8333003
0.9167002
1.00000010
2.00000012
3.0000007
4.00000010
5.0000005
6.0000006
7.0000004
8.0000006
9.00000010
10.0000004
11.0000004
11.5000001
12.0000003
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":99,\"ncol\":2,\"columns\":[\"age\",\"count\"],\"kotlin_dataframe\":[{\"age\":0.1667,\"count\":1},{\"age\":0.3333,\"count\":1},{\"age\":0.4167,\"count\":1},{\"age\":0.6667,\"count\":1},{\"age\":0.75,\"count\":3},{\"age\":0.8333,\"count\":3},{\"age\":0.9167,\"count\":2},{\"age\":1.0,\"count\":10},{\"age\":2.0,\"count\":12},{\"age\":3.0,\"count\":7},{\"age\":4.0,\"count\":10},{\"age\":5.0,\"count\":5},{\"age\":6.0,\"count\":6},{\"age\":7.0,\"count\":4},{\"age\":8.0,\"count\":6},{\"age\":9.0,\"count\":10},{\"age\":10.0,\"count\":4},{\"age\":11.0,\"count\":4},{\"age\":11.5,\"count\":1},{\"age\":12.0,\"count\":3}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 13 } ], "source": [ "val byAge = df1.valueCounts { age }.sortBy { age }\n", "byAge" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:22.359463114Z", "start_time": "2023-12-05T11:12:21.495527293Z" } }, "outputs": [], "source": [ "// JetBrains color palette\n", "val colors = mapOf(\n", " \"light_orange\" to \"#ffb59e\",\n", " \"orange\" to \"#ff6632\",\n", " \"light_grey\" to \"#a6a6a6\",\n", " \"dark_grey\" to \"#4c4c4c\",\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:23.089524496Z", "start_time": "2023-12-05T11:12:21.639050969Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 15 } ], "source": [ "letsPlot(byAge.toMap()) { x = \"age\"; y = \"count\" } +\n", " geomPoint(size = 5, color = colors[\"dark_grey\"]) +\n", " ggsize(width = 850, height = 500)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:23.630345133Z", "start_time": "2023-12-05T11:12:22.242169028Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 16 } ], "source": [ "val age = df.select { age }.dropNulls().sortBy { age }\n", "\n", "letsPlot(age.toMap()) { x = \"age\" } + \n", " geomHistogram(binWidth = 5, fill = colors[\"orange\"]) + \n", " ggsize(width = 850, height = 500)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:24.013018994Z", "start_time": "2023-12-05T11:12:22.815315821Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
agesurvived
10
0.16670010
0.33330001
0.41670010
0.66670010
0.75000021
0.83330030
0.91670020
1.00000073
2.00000048
3.00000052
4.00000073
5.00000041
6.00000033
7.00000022
8.00000042
9.00000046
10.00000004
11.00000013
11.50000001
12.00000030
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":99,\"ncol\":2,\"columns\":[\"age\",\"survived\"],\"kotlin_dataframe\":[{\"age\":0.1667,\"survived\":{\"1\":1,\"0\":0}},{\"age\":0.3333,\"survived\":{\"1\":0,\"0\":1}},{\"age\":0.4167,\"survived\":{\"1\":1,\"0\":0}},{\"age\":0.6667,\"survived\":{\"1\":1,\"0\":0}},{\"age\":0.75,\"survived\":{\"1\":2,\"0\":1}},{\"age\":0.8333,\"survived\":{\"1\":3,\"0\":0}},{\"age\":0.9167,\"survived\":{\"1\":2,\"0\":0}},{\"age\":1.0,\"survived\":{\"1\":7,\"0\":3}},{\"age\":2.0,\"survived\":{\"1\":4,\"0\":8}},{\"age\":3.0,\"survived\":{\"1\":5,\"0\":2}},{\"age\":4.0,\"survived\":{\"1\":7,\"0\":3}},{\"age\":5.0,\"survived\":{\"1\":4,\"0\":1}},{\"age\":6.0,\"survived\":{\"1\":3,\"0\":3}},{\"age\":7.0,\"survived\":{\"1\":2,\"0\":2}},{\"age\":8.0,\"survived\":{\"1\":4,\"0\":2}},{\"age\":9.0,\"survived\":{\"1\":4,\"0\":6}},{\"age\":10.0,\"survived\":{\"1\":0,\"0\":4}},{\"age\":11.0,\"survived\":{\"1\":1,\"0\":3}},{\"age\":11.5,\"survived\":{\"1\":0,\"0\":1}},{\"age\":12.0,\"survived\":{\"1\":3,\"0\":0}}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 17 } ], "source": [ "df1.groupBy { age }.pivotCounts { survived }.sortBy { age }" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:24.334865432Z", "start_time": "2023-12-05T11:12:23.154913794Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
survivedage
10.166700
00.333300
10.416700
10.666700
10.750000
10.750000
00.750000
10.833300
10.833300
10.833300
10.916700
10.916700
11.000000
11.000000
11.000000
11.000000
01.000000
11.000000
01.000000
11.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":1309,\"ncol\":2,\"columns\":[\"survived\",\"age\"],\"kotlin_dataframe\":[{\"survived\":1,\"age\":0.1667},{\"survived\":0,\"age\":0.3333},{\"survived\":1,\"age\":0.4167},{\"survived\":1,\"age\":0.6667},{\"survived\":1,\"age\":0.75},{\"survived\":1,\"age\":0.75},{\"survived\":0,\"age\":0.75},{\"survived\":1,\"age\":0.8333},{\"survived\":1,\"age\":0.8333},{\"survived\":1,\"age\":0.8333},{\"survived\":1,\"age\":0.9167},{\"survived\":1,\"age\":0.9167},{\"survived\":1,\"age\":1.0},{\"survived\":1,\"age\":1.0},{\"survived\":1,\"age\":1.0},{\"survived\":1,\"age\":1.0},{\"survived\":0,\"age\":1.0},{\"survived\":1,\"age\":1.0},{\"survived\":0,\"age\":1.0},{\"survived\":1,\"age\":1.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 18 } ], "source": [ "val survivedByAge = df1.select { survived and age }.sortBy { age }\n", "survivedByAge" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:24.604154048Z", "start_time": "2023-12-05T11:12:23.566174848Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 19 } ], "source": [ "val plot = letsPlot(\n", " survivedByAge\n", " .convert { survived }.with { if (it == 1) \"Survived\" else \"Died\" }\n", " .toMap()\n", ")\n", "\n", "plot +\n", " geomHistogram(\n", " binWidth = 5,\n", " alpha = 0.7,\n", " position = positionDodge(),\n", " ) { x = \"age\"; fill = \"survived\" } +\n", " scaleFillManual(\n", " values = listOf(colors[\"dark_grey\"]!!, colors[\"orange\"]!!),\n", " ) +\n", " ggsize(width = 850, height = 500)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:24.969425888Z", "start_time": "2023-12-05T11:12:24.142220255Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 20 } ], "source": [ "// Density plot\n", "plot +\n", " geomDensity { x = \"age\"; color = \"survived\" } +\n", " scaleColorManual(values = listOf(colors[\"dark_grey\"]!!, colors[\"orange\"]!!)) +\n", " ggsize(width = 850, height = 250)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:25.602308835Z", "start_time": "2023-12-05T11:12:24.577080079Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 21 } ], "source": [ "// A basic box plot\n", "plot +\n", " geomBoxplot { x = \"survived\"; y = \"age\"; fill = \"survived\" } +\n", " scaleFillManual(values = listOf(colors[\"dark_grey\"]!!, colors[\"orange\"]!!)) +\n", " ggsize(width = 500, height = 400)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Seems like we have the same age distribution among survived and not survived passengers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Categorical features with One Hot Encoding\n", "\n", "To prepare data for the ML algorithms, we should replace all String values in categorical features on numbers. There are a few ways of how to preprocess categorical features, and One Hot Encoding is one of them. We will use [`pivotMatches`](https://kotlin.github.io/dataframe/pivot.html#pivotmatches) operation to convert categorical columns into sets of nested `Boolean` columns per every unique value." ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:28.079526575Z", "start_time": "2023-12-05T11:12:24.922344734Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
survivednameagesibspparchticketfarecabinboatbodyhomedestpclasssexembarked
123femalemaleSAACQ
1Allen, Miss. Elisabeth Walton29.0000000.0000000.00000024160211.337500B52nullSt Louis, MOtruefalsefalsetruefalsetruefalsefalsefalse
1Allison, Master. Hudson Trevor0.9167001.0000002.000000113781151.550000C22 C2611nullMontreal, PQ / Chesterville, ONtruefalsefalsefalsetruefalsetruefalsefalse
0Allison, Miss. Helen Loraine2.0000001.0000002.000000113781151.550000C22 C26nullnullMontreal, PQ / Chesterville, ONtruefalsefalsetruefalsetruefalsefalsefalse
0Allison, Mr. Hudson Joshua Creighton30.0000001.0000002.000000113781151.550000C22 C26null135Montreal, PQ / Chesterville, ONtruefalsefalsefalsetruetruefalsefalsefalse
0Allison, Mrs. Hudson J C (Bessie Wald...25.0000001.0000002.000000113781151.550000C22 C26nullnullMontreal, PQ / Chesterville, ONtruefalsefalsetruefalsetruefalsefalsefalse
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":14,\"columns\":[\"survived\",\"name\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"boat\",\"body\",\"homedest\",\"pclass\",\"sex\",\"embarked\"],\"kotlin_dataframe\":[{\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"age\":29.0,\"sibsp\":0.0,\"parch\":0.0,\"ticket\":\"24160\",\"fare\":211.3375,\"cabin\":\"B5\",\"boat\":\"2\",\"body\":null,\"homedest\":\"St Louis, MO\",\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"age\":0.9167,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"boat\":\"11\",\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sex\":{\"female\":false,\"male\":true},\"embarked\":{\"S\":false,\"AA\":true,\"C\":false,\"Q\":false}},{\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"age\":2.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"age\":30.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"boat\":null,\"body\":135,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sex\":{\"female\":false,\"male\":true},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"age\":25.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 22 } ], "source": [ "val pivoted = df1.pivotMatches { pclass and sex and embarked }\n", "pivoted.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:28.954959988Z", "start_time": "2023-12-05T11:12:27.984827269Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
survivedpclasssibspparchagefaresexembarked
123femalemaleSAACQ
1.000000truefalsefalse0.0000000.00000029.000000211.337500truefalsetruefalsefalsefalse
1.000000truefalsefalse1.0000002.0000000.916700151.550000falsetruefalsetruefalsefalse
0.000000truefalsefalse1.0000002.0000002.000000151.550000truefalsetruefalsefalsefalse
0.000000truefalsefalse1.0000002.00000030.000000151.550000falsetruetruefalsefalsefalse
0.000000truefalsefalse1.0000002.00000025.000000151.550000truefalsetruefalsefalsefalse
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":8,\"columns\":[\"survived\",\"pclass\",\"sibsp\",\"parch\",\"age\",\"fare\",\"sex\",\"embarked\"],\"kotlin_dataframe\":[{\"survived\":1.0,\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sibsp\":0.0,\"parch\":0.0,\"age\":29.0,\"fare\":211.3375,\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":1.0,\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sibsp\":1.0,\"parch\":2.0,\"age\":0.9167,\"fare\":151.55,\"sex\":{\"female\":false,\"male\":true},\"embarked\":{\"S\":false,\"AA\":true,\"C\":false,\"Q\":false}},{\"survived\":0.0,\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sibsp\":1.0,\"parch\":2.0,\"age\":2.0,\"fare\":151.55,\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":0.0,\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sibsp\":1.0,\"parch\":2.0,\"age\":30.0,\"fare\":151.55,\"sex\":{\"female\":false,\"male\":true},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}},{\"survived\":0.0,\"pclass\":{\"1\":true,\"2\":false,\"3\":false},\"sibsp\":1.0,\"parch\":2.0,\"age\":25.0,\"fare\":151.55,\"sex\":{\"female\":true,\"male\":false},\"embarked\":{\"S\":true,\"AA\":false,\"C\":false,\"Q\":false}}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 23 } ], "source": [ "val df2 = pivoted\n", " // feature extraction\n", " .select { cols(survived, pclass, sibsp, parch, age, fare, sex, embarked) }\n", " .convert { cols { it.isValueColumn() } /* TODO: change to valueCols() after Selection DSL overhaul */ }.toDouble()\n", "\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:29.932396400Z", "start_time": "2023-12-05T11:12:28.940970940Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " " ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 24 } ], "source": [ "val titanicData = df2.flatten().toMap()\n", "\n", "val plots = listOf(\n", " CorrPlot(titanicData, \"Tiles\")\n", " .tiles()\n", " .paletteGradient(colors[\"orange\"]!!, colors[\"light_grey\"]!!, colors[\"dark_grey\"]!!)\n", " .build(),\n", " CorrPlot(titanicData, \"Points\")\n", " .points()\n", " .paletteGradient(colors[\"orange\"]!!, colors[\"light_grey\"]!!, colors[\"dark_grey\"]!!)\n", " .build(),\n", " CorrPlot(titanicData, \"Tiles and labels\")\n", " .tiles()\n", " .labels()\n", " .paletteGradient(colors[\"orange\"]!!, colors[\"light_grey\"]!!, colors[\"dark_grey\"]!!)\n", " .build(),\n", " CorrPlot(titanicData, \"Tiles, points and labels\").points().labels()\n", " .tiles()\n", " .paletteGradient(colors[\"orange\"]!!, colors[\"light_grey\"]!!, colors[\"dark_grey\"]!!)\n", " .build()\n", ")\n", "val widths = listOf(700)\n", "val heights = plots.map { 600 }\n", "\n", "gggrid(plots = plots, ncol = 1, widths = widths, heights = heights)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Creation of new features\n", "\n", "We suggest to combine both, **Sibsp** and **parch** features, into the new one feature with the name **FamilyNumber** as a simple sum of **sibsp** and **parch**." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:30.542949242Z", "start_time": "2023-12-05T11:12:29.917730243Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhomedestfamilyNumber
11Allen, Miss. Elisabeth Waltonfemale29.0000000.0000000.00000024160211.337500B5S2nullSt Louis, MO0.000000
11Allison, Master. Hudson Trevormale0.9167001.0000002.000000113781151.550000C22 C26AA11nullMontreal, PQ / Chesterville, ON3.000000
10Allison, Miss. Helen Lorainefemale2.0000001.0000002.000000113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON3.000000
10Allison, Mr. Hudson Joshua Creightonmale30.0000001.0000002.000000113781151.550000C22 C26Snull135Montreal, PQ / Chesterville, ON3.000000
10Allison, Mrs. Hudson J C (Bessie Wald...female25.0000001.0000002.000000113781151.550000C22 C26SnullnullMontreal, PQ / Chesterville, ON3.000000
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":15,\"columns\":[\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"homedest\",\"familyNumber\"],\"kotlin_dataframe\":[{\"pclass\":1,\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"sex\":\"female\",\"age\":29.0,\"sibsp\":0.0,\"parch\":0.0,\"ticket\":\"24160\",\"fare\":211.3375,\"cabin\":\"B5\",\"embarked\":\"S\",\"boat\":\"2\",\"body\":null,\"homedest\":\"St Louis, MO\",\"familyNumber\":0.0},{\"pclass\":1,\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"sex\":\"male\",\"age\":0.9167,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"AA\",\"boat\":\"11\",\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"familyNumber\":3.0},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"sex\":\"female\",\"age\":2.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"familyNumber\":3.0},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"sex\":\"male\",\"age\":30.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":135,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"familyNumber\":3.0},{\"pclass\":1,\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"sex\":\"female\",\"age\":25.0,\"sibsp\":1.0,\"parch\":2.0,\"ticket\":\"113781\",\"fare\":151.55,\"cabin\":\"C22 C26\",\"embarked\":\"S\",\"boat\":null,\"body\":null,\"homedest\":\"Montreal, PQ / Chesterville, ON\",\"familyNumber\":3.0}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 25 } ], "source": [ "val familyDF = df1.add(\"familyNumber\") { sibsp + parch }\n", "\n", "familyDF.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:30.792559672Z", "start_time": "2023-12-05T11:12:30.420655130Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
columnsurvived
familyNumber0.026876
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":1,\"ncol\":2,\"columns\":[\"column\",\"survived\"],\"kotlin_dataframe\":[{\"column\":\"familyNumber\",\"survived\":0.02687643412533192}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 26 } ], "source": [ "familyDF.corr { familyNumber }.with { survived }" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:31.263208547Z", "start_time": "2023-12-05T11:12:30.592811049Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
columnage
familyNumber-0.196996
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":1,\"ncol\":2,\"columns\":[\"column\",\"age\"],\"kotlin_dataframe\":[{\"column\":\"familyNumber\",\"age\":-0.19699624168458799}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 27 } ], "source": [ "familyDF.corr { familyNumber }.with { age }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Looks like the new feature has no influence on the **survived** column, but it has a strong negative correlation with **age**. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Titles\n", "Let's try to extract something from the names. A lot of string in the name column contains special titles, like Done, Mr, Mrs and so on." ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:32.228096281Z", "start_time": "2023-12-05T11:12:30.811725316Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
survivednametitle
1Allen, Miss. Elisabeth WaltonMiss
1Allison, Master. Hudson TrevorMaster
0Allison, Miss. Helen LoraineMiss
0Allison, Mr. Hudson Joshua CreightonMr
0Allison, Mrs. Hudson J C (Bessie Wald...Mrs
1Anderson, Mr. HarryMr
1Andrews, Miss. Kornelia TheodosiaMiss
0Andrews, Mr. Thomas JrMr
1Appleton, Mrs. Edward Dale (Charlotte...Mrs
0Artagaveytia, Mr. RamonMr
0Astor, Col. John JacobCol
1Astor, Mrs. John Jacob (Madeleine Tal...Mrs
1Aubart, Mme. Leontine PaulineMme
1Barber, Miss. Ellen "Nellie"Miss
1Barkworth, Mr. Algernon Henry WilsonMr
0Baumann, Mr. John DMr
0Baxter, Mr. Quigg EdmondMr
1Baxter, Mrs. James (Helene DeLaudenie...Mrs
1Bazzani, Miss. AlbinaMiss
0Beattie, Mr. ThomsonMr
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":100,\"ncol\":3,\"columns\":[\"survived\",\"name\",\"title\"],\"kotlin_dataframe\":[{\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"title\":\"Miss\"},{\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"title\":\"Master\"},{\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"title\":\"Miss\"},{\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"title\":\"Mr\"},{\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"title\":\"Mrs\"},{\"survived\":1,\"name\":\"Anderson, Mr. Harry\",\"title\":\"Mr\"},{\"survived\":1,\"name\":\"Andrews, Miss. Kornelia Theodosia\",\"title\":\"Miss\"},{\"survived\":0,\"name\":\"Andrews, Mr. Thomas Jr\",\"title\":\"Mr\"},{\"survived\":1,\"name\":\"Appleton, Mrs. Edward Dale (Charlotte Lamson)\",\"title\":\"Mrs\"},{\"survived\":0,\"name\":\"Artagaveytia, Mr. Ramon\",\"title\":\"Mr\"},{\"survived\":0,\"name\":\"Astor, Col. John Jacob\",\"title\":\"Col\"},{\"survived\":1,\"name\":\"Astor, Mrs. John Jacob (Madeleine Talmadge Force)\",\"title\":\"Mrs\"},{\"survived\":1,\"name\":\"Aubart, Mme. Leontine Pauline\",\"title\":\"Mme\"},{\"survived\":1,\"name\":\"Barber, Miss. Ellen \\\"Nellie\\\"\",\"title\":\"Miss\"},{\"survived\":1,\"name\":\"Barkworth, Mr. Algernon Henry Wilson\",\"title\":\"Mr\"},{\"survived\":0,\"name\":\"Baumann, Mr. John D\",\"title\":\"Mr\"},{\"survived\":0,\"name\":\"Baxter, Mr. Quigg Edmond\",\"title\":\"Mr\"},{\"survived\":1,\"name\":\"Baxter, Mrs. James (Helene DeLaudeniere Chaput)\",\"title\":\"Mrs\"},{\"survived\":1,\"name\":\"Bazzani, Miss. Albina\",\"title\":\"Miss\"},{\"survived\":0,\"name\":\"Beattie, Mr. Thomson\",\"title\":\"Mr\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 28 } ], "source": [ "val titledDF = df\n", " .select { survived and name }\n", " .add(\"title\") {\n", " name.split(\".\")[0].split(\",\")[1].trim()\n", " }\n", "titledDF.head(100)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:32.315360773Z", "start_time": "2023-12-05T11:12:31.581911916Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
titlecount
Mr757
Miss260
Mrs197
Master61
Dr8
Rev8
Col4
Major2
Mlle2
Ms2
Mme1
Capt1
Lady1
Sir1
Dona1
Jonkheer1
the Countess1
Don1
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":18,\"ncol\":2,\"columns\":[\"title\",\"count\"],\"kotlin_dataframe\":[{\"title\":\"Mr\",\"count\":757},{\"title\":\"Miss\",\"count\":260},{\"title\":\"Mrs\",\"count\":197},{\"title\":\"Master\",\"count\":61},{\"title\":\"Dr\",\"count\":8},{\"title\":\"Rev\",\"count\":8},{\"title\":\"Col\",\"count\":4},{\"title\":\"Major\",\"count\":2},{\"title\":\"Mlle\",\"count\":2},{\"title\":\"Ms\",\"count\":2},{\"title\":\"Mme\",\"count\":1},{\"title\":\"Capt\",\"count\":1},{\"title\":\"Lady\",\"count\":1},{\"title\":\"Sir\",\"count\":1},{\"title\":\"Dona\",\"count\":1},{\"title\":\"Jonkheer\",\"count\":1},{\"title\":\"the Countess\",\"count\":1},{\"title\":\"Don\",\"count\":1}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 29 } ], "source": [ "titledDF.valueCounts { title }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "New **Title** column contains some rare titles and some titles with typos. Let's clean the data and merge rare titles into one category." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:32.552754731Z", "start_time": "2023-12-05T11:12:31.753157131Z" } }, "outputs": [], "source": [ "val rareTitles = listOf(\n", " \"Dona\", \"Lady\", \"the Countess\", \"Capt\", \"Col\", \"Don\",\n", " \"Dr\", \"Major\", \"Rev\", \"Sir\", \"Jonkheer\",\n", ")\n", "\n", "val cleanedTitledDF = titledDF.update { title }.with {\n", " when {\n", " it == \"Mlle\" -> \"Miss\"\n", " it == \"Ms\" -> \"Miss\"\n", " it == \"Mme\" -> \"Mrs\"\n", " it in rareTitles -> \"Rare Title\"\n", " else -> it\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:32.897571465Z", "start_time": "2023-12-05T11:12:32.007074685Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
titlecount
Mr757
Miss264
Mrs198
Master61
Rare Title29
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":2,\"columns\":[\"title\",\"count\"],\"kotlin_dataframe\":[{\"title\":\"Mr\",\"count\":757},{\"title\":\"Miss\",\"count\":264},{\"title\":\"Mrs\",\"count\":198},{\"title\":\"Master\",\"count\":61},{\"title\":\"Rare Title\",\"count\":29}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 31 } ], "source": [ "cleanedTitledDF.valueCounts { title }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now it looks awesome and we have only 5 different titles and could see how it correlates with survival." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:33.262849935Z", "start_time": "2023-12-05T11:12:32.134725743Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
titlesurvived
Miss0.306069
Master0.057318
Mr-0.528775
Mrs0.352536
Rare Title-0.000915
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":2,\"columns\":[\"title\",\"survived\"],\"kotlin_dataframe\":[{\"title\":\"Miss\",\"survived\":0.30606871573905226},{\"title\":\"Master\",\"survived\":0.0573179698378937},{\"title\":\"Mr\",\"survived\":-0.5287747518050332},{\"title\":\"Mrs\",\"survived\":0.3525356336629826},{\"title\":\"Rare Title\",\"survived\":-9.149409567074339E-4}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 32 } ], "source": [ "val correlations = cleanedTitledDF\n", " .pivotMatches { title }\n", " .corr { title }.with { survived }\n", "correlations" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:33.628583500Z", "start_time": "2023-12-05T11:12:32.774996781Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
titlesurvived
Miss0.306069
Master0.057318
Mr-0.528775
Mrs0.352536
Rare Title-0.000915
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":2,\"columns\":[\"title\",\"survived\"],\"kotlin_dataframe\":[{\"title\":\"Miss\",\"survived\":0.30606871573905226},{\"title\":\"Master\",\"survived\":0.0573179698378937},{\"title\":\"Mr\",\"survived\":-0.5287747518050332},{\"title\":\"Mrs\",\"survived\":0.3525356336629826},{\"title\":\"Rare Title\",\"survived\":-9.149409567074339E-4}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 33 } ], "source": [ "correlations\n", " .update { title }.with { it.substringAfter('_') }\n", " .filter { title != \"survived\" }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The women with title **Miss** and **Mrs** have the same chances to survive, but not the same for the men. If you have a title **Mr**, your deals are bad on the Titanic.\n", "\n", "**Rare title** is really rare and doesn't play a big role." ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:34.182463055Z", "start_time": "2023-12-05T11:12:32.962632438Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
titlesurvivedcount
Master030
Master131
Miss085
Miss1179
Mr0634
Mr1123
Mrs042
Mrs1156
Rare Title018
Rare Title111
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":10,\"ncol\":3,\"columns\":[\"title\",\"survived\",\"count\"],\"kotlin_dataframe\":[{\"title\":\"Master\",\"survived\":0,\"count\":30},{\"title\":\"Master\",\"survived\":1,\"count\":31},{\"title\":\"Miss\",\"survived\":0,\"count\":85},{\"title\":\"Miss\",\"survived\":1,\"count\":179},{\"title\":\"Mr\",\"survived\":0,\"count\":634},{\"title\":\"Mr\",\"survived\":1,\"count\":123},{\"title\":\"Mrs\",\"survived\":0,\"count\":42},{\"title\":\"Mrs\",\"survived\":1,\"count\":156},{\"title\":\"Rare Title\",\"survived\":0,\"count\":18},{\"title\":\"Rare Title\",\"survived\":1,\"count\":11}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 34 } ], "source": [ "val groupedCleanedTitledDF = cleanedTitledDF\n", " .valueCounts { title and survived }\n", " .sortBy { title and survived }\n", "groupedCleanedTitledDF" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Surname's analysis\n", "It's very interesting to dig deeper into families, home destinations, and we could do start this analysis from surnames which could be easily extracted from **Name** feature." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:34.873592166Z", "start_time": "2023-12-05T11:12:33.476401507Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
survivednamesurname
1Allen, Miss. Elisabeth WaltonAllen
1Allison, Master. Hudson TrevorAllison
0Allison, Miss. Helen LoraineAllison
0Allison, Mr. Hudson Joshua CreightonAllison
0Allison, Mrs. Hudson J C (Bessie Wald...Allison
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":5,\"ncol\":3,\"columns\":[\"survived\",\"name\",\"surname\"],\"kotlin_dataframe\":[{\"survived\":1,\"name\":\"Allen, Miss. Elisabeth Walton\",\"surname\":\"Allen\"},{\"survived\":1,\"name\":\"Allison, Master. Hudson Trevor\",\"surname\":\"Allison\"},{\"survived\":0,\"name\":\"Allison, Miss. Helen Loraine\",\"surname\":\"Allison\"},{\"survived\":0,\"name\":\"Allison, Mr. Hudson Joshua Creighton\",\"surname\":\"Allison\"},{\"survived\":0,\"name\":\"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)\",\"surname\":\"Allison\"}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 35 } ], "source": [ "val surnameDF = df1\n", " .select { survived and name }\n", " .add(\"surname\") {\n", " name.split(\".\")[0].split(\",\")[0].trim()\n", " }\n", "surnameDF.head()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:35.215080332Z", "start_time": "2023-12-05T11:12:34.115103313Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
surnamecount
Andersson11
Sage11
Asplund8
Goodwin8
Davies7
Brown6
Carter6
Fortune6
Smith6
Ford6
Johnson6
Panula6
Rice6
Skoog6
Ryerson5
Williams5
Kelly5
Lefebre5
Palsson5
Thomas5
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":875,\"ncol\":2,\"columns\":[\"surname\",\"count\"],\"kotlin_dataframe\":[{\"surname\":\"Andersson\",\"count\":11},{\"surname\":\"Sage\",\"count\":11},{\"surname\":\"Asplund\",\"count\":8},{\"surname\":\"Goodwin\",\"count\":8},{\"surname\":\"Davies\",\"count\":7},{\"surname\":\"Brown\",\"count\":6},{\"surname\":\"Carter\",\"count\":6},{\"surname\":\"Fortune\",\"count\":6},{\"surname\":\"Smith\",\"count\":6},{\"surname\":\"Ford\",\"count\":6},{\"surname\":\"Johnson\",\"count\":6},{\"surname\":\"Panula\",\"count\":6},{\"surname\":\"Rice\",\"count\":6},{\"surname\":\"Skoog\",\"count\":6},{\"surname\":\"Ryerson\",\"count\":5},{\"surname\":\"Williams\",\"count\":5},{\"surname\":\"Kelly\",\"count\":5},{\"surname\":\"Lefebre\",\"count\":5},{\"surname\":\"Palsson\",\"count\":5},{\"surname\":\"Thomas\",\"count\":5}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 36 } ], "source": [ "surnameDF.valueCounts { surname }" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:35.222249097Z", "start_time": "2023-12-05T11:12:34.230241724Z" } }, "outputs": [ { "data": { "text/plain": [ "875" ] }, "metadata": {}, "output_type": "execute_result", "execution_count": 37 } ], "source": [ "surnameDF.surname.countDistinct()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2023-12-05T11:12:35.826748466Z", "start_time": "2023-12-05T11:12:34.300627752Z" } }, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", "
firstSymbolsurvived
A-0.017914
B0.050554
C0.009037
D0.051711
E-0.034629
F0.000400
G-0.044483
H0.042187
I-0.008329
J-0.026790
K-0.014219
L-0.021071
M0.019041
N0.028698
O0.000128
P-0.058996
R-0.021941
S-0.020043
T0.052264
U-0.021737
\n", " \n", " \n", " " ], "application/kotlindataframe+json": "{\"nrow\":27,\"ncol\":2,\"columns\":[\"firstSymbol\",\"survived\"],\"kotlin_dataframe\":[{\"firstSymbol\":\"A\",\"survived\":-0.01791352622509756},{\"firstSymbol\":\"B\",\"survived\":0.050553943254341316},{\"firstSymbol\":\"C\",\"survived\":0.009037371118975828},{\"firstSymbol\":\"D\",\"survived\":0.05171064357839075},{\"firstSymbol\":\"E\",\"survived\":-0.03462861880002389},{\"firstSymbol\":\"F\",\"survived\":4.002707178710613E-4},{\"firstSymbol\":\"G\",\"survived\":-0.044483069276203296},{\"firstSymbol\":\"H\",\"survived\":0.04218724210575963},{\"firstSymbol\":\"I\",\"survived\":-0.008329183461658642},{\"firstSymbol\":\"J\",\"survived\":-0.026790134975567197},{\"firstSymbol\":\"K\",\"survived\":-0.014218719831367379},{\"firstSymbol\":\"L\",\"survived\":-0.021070982850893608},{\"firstSymbol\":\"M\",\"survived\":0.019040748971095155},{\"firstSymbol\":\"N\",\"survived\":0.02869766040895532},{\"firstSymbol\":\"O\",\"survived\":1.2837933079428644E-4},{\"firstSymbol\":\"P\",\"survived\":-0.05899571294487214},{\"firstSymbol\":\"R\",\"survived\":-0.021940537137738973},{\"firstSymbol\":\"S\",\"survived\":-0.020042889901563117},{\"firstSymbol\":\"T\",\"survived\":0.05226413426840335},{\"firstSymbol\":\"U\",\"survived\":-0.0217373635753353}]}" }, "metadata": {}, "output_type": "execute_result", "execution_count": 38 } ], "source": [ "val firstSymbol by column()\n", "\n", "df1\n", " .add(firstSymbol) {\n", " name.split(\".\")[0].split(\",\")[0].trim().first().toString()\n", " }\n", " .pivotMatches(firstSymbol)\n", " .corr { firstSymbol }.with { survived }\n" ] } ], "metadata": { "kernelspec": { "display_name": "Kotlin", "language": "kotlin", "name": "kotlin" }, "language_info": { "codemirror_mode": "text/x-kotlin", "file_extension": ".kt", "mimetype": "text/x-kotlin", "name": "kotlin", "nbconvert_exporter": "", "pygments_lexer": "kotlin", "version": "1.8.20-Beta" }, "ktnbPluginMetadata": { "projectLibraries": [] } }, "nbformat": 4, "nbformat_minor": 1 }