{ "cells": [ { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2024-08-21T16:32:46.474583Z", "start_time": "2024-08-21T16:32:39.480718Z" } }, "source": "%use krangl@2fcf74dfbbe382f1803d1ab9e4739439e1f5671b", "outputs": [], "execution_count": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
FranzSmith2388
HorstKeanes1282
" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// in memory\n", "val df: DataFrame = dataFrameOf(\n", " \"first_name\", \"last_name\", \"age\", \"weight\")(\n", " \"Max\", \"Doe\", 23, 55,\n", " \"Franz\", \"Smith\", 23, 88,\n", " \"Horst\", \"Keanes\", 12, 82\n", ")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add columns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightsalary_category
MaxDoe23553
FranzSmith23883
HorstKeanes12823
" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.addColumn(\"salary_category\") { 3 }" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightage_3y_later
MaxDoe235526
FranzSmith238826
HorstKeanes128215
" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.addColumn(\"age_3y_later\") { it[\"age\"] + 3 }" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightfull_name
MaxDoe2355Max Doe
FranzSmith2388Franz Smith
HorstKeanes1282Horst Keanes
" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.\n", "val newDF = df.addColumn(\"full_name\") { it[\"first_name\"] + \" \" + it[\"last_name\"] }\n", "newDF" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightuser_id
MaxDoe2355Doe_id1
FranzSmith2388Smith_id2
HorstKeanes1282Keanes_id3
" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Also feel free to mix types here since krangl overloads arithmetic operators like + for dataframe-columns\n", "df.addColumn(\"user_id\") { it[\"last_name\"] + \"_id\" + rowNumber }" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightwith_anz
MaxDoe2355false
FranzSmith2388true
HorstKeanes1282false
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Create new attributes with string operations like matching, splitting or extraction.\n", "df.addColumn(\"with_anz\") { it[\"first_name\"].asStrings().map { it!!.contains(\"anz\") } }" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightfirst_name_initial
MaxDoe2355M
FranzSmith2388F
HorstKeanes1282H
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits\n", "df.addColumn(\"first_name_initial\") { it[\"first_name\"].map{ it.first() } }" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweightage_plus3initials
MaxDoe235526M D
FranzSmith238826F S
HorstKeanes128215H K
" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// or add multiple columns at once\n", "df.addColumns(\n", " \"age_plus3\" to { it[\"age\"] + 3 },\n", " \"initials\" to { it[\"first_name\"].map { it.first() } concat it[\"last_name\"].map { it.first() } }\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
HorstKeanes1282
MaxDoe2355
FranzSmith2388
" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Sort your data with sortedBy\n", "df.sortedBy(\"age\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
HorstKeanes1282
MaxDoe2355
FranzSmith2388
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// and add secondary sorting attributes as varargs\n", "df.sortedBy(\"age\", \"weight\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
FranzSmith2388
MaxDoe2355
HorstKeanes1282
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sortedByDescending(\"age\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
HorstKeanes1282
FranzSmith2388
" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sortedBy { it[\"weight\"].asInts() }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ageweight
2355
2388
1282
" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Subset columns with select\n", "df.selectIf { it is IntCol } // functional style column selection" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
last_nameweight
Doe55
Smith88
Keanes82
" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.select(\"last_name\", \"weight\") // positive selection" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_name
MaxDoe
FranzSmith
HorstKeanes
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.remove(\"weight\", \"age\") // negative selection" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_name
MaxDoe
FranzSmith
HorstKeanes
" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.select({ endsWith(\"name\") }) // selector mini-language" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
FranzSmith2388
" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Subset rows with vectorized filter\n", "df.filter { it[\"age\"] eq 23 }" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
FranzSmith2388
HorstKeanes1282
" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter { it[\"weight\"] gt 60 }" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter({ it[\"last_name\"].isMatching { startsWith(\"Do\") }})" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
FranzSmith2388
" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// In case vectorized operations are not possible or available we can also filter tables by row\n", "// which allows for scalar operators\n", "df.filterByRow { it[\"age\"] as Int > 20 }" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
HorstKeanes1282
" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filterByRow { (it[\"age\"] as Int).rem(10) == 2 }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarize" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
agelast_namen
23Doe1
23Smith1
12Keanes1
" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// do simple cross tabulations\n", "df.count(\"age\", \"last_name\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
mean_age
19.333333333333332
" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// ... or calculate single summary statistic\n", "df.summarize(\"mean_age\" to { it[\"age\"].mean(true) })" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
min_agemax_age
12.023.0
" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// ... or multiple summary statistics\n", "df.summarize(\n", " \"min_age\" to { it[\"age\"].min() },\n", " \"max_age\" to { it[\"age\"].max() }\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
min_agemax_age
12.023.0
" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// for sake of r and python adoptability you can also use `=` here\n", "df.summarize(\n", " \"min_age\" `=` { it[\"age\"].min() },\n", " \"max_age\" `=` { it[\"age\"].max() }\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Group" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
agemean_weightnum_persons
2371.52
1282.01
" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// Grouped operations\n", "val groupedDf: DataFrame = df.groupBy(\"age\") // or provide multiple grouping attributes with varargs\n", "\n", "val sumDF = groupedDf.summarize(\n", " \"mean_weight\" to { it[\"weight\"].mean(removeNA = true) },\n", " \"num_persons\" to { nrow }\n", ")\n", "sumDF" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
first_namelast_nameageweight
MaxDoe2355
FranzSmith2388
HorstKeanes1282
" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Optionally ungroup the data\n", "groupedDf.ungroup()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Generate object bindings for kotlin.\n", "data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)\n", "val records = sumDF.rowsAs()\n", "\n", "records" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "71.5" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Now we can use the krangl result table in a strongly typed way\n", "records.first().mean_weight" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
agemean_weightnum_persons
2371.52
1282.01
" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Vice versa we can also convert an existing set of objects into\n", "records.asDataFrame()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ageweight
2371.5
1282.0
" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// to populate a data-frame with selected properties only, we can do\n", "records.deparseRecords { mapOf(\"age\" to it.age, \"weight\" to it.mean_weight) }" ] } ], "metadata": { "kernelspec": { "display_name": "Kotlin", "language": "kotlin", "name": "kotlin" }, "language_info": { "codemirror_mode": "text/x-kotlin", "file_extension": "kt", "name": "kotlin" } }, "nbformat": 4, "nbformat_minor": 2 }