{ "cells": [ { "cell_type": "code", "metadata": { "ExecuteTime": { "end_time": "2024-08-21T16:32:46.474583Z", "start_time": "2024-08-21T16:32:39.480718Z" } }, "source": "%use krangl@2fcf74dfbbe382f1803d1ab9e4739439e1f5671b", "outputs": [], "execution_count": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88
Horst	Keanes	12	82

" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// in memory\n", "val df: DataFrame = dataFrameOf(\n", " \"first_name\", \"last_name\", \"age\", \"weight\")(\n", " \"Max\", \"Doe\", 23, 55,\n", " \"Franz\", \"Smith\", 23, 88,\n", " \"Horst\", \"Keanes\", 12, 82\n", ")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add columns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	salary_category
Max	Doe	23	55	3
Franz	Smith	23	88	3
Horst	Keanes	12	82	3

" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.addColumn(\"salary_category\") { 3 }" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	age_3y_later
Max	Doe	23	55	26
Franz	Smith	23	88	26
Horst	Keanes	12	82	15

" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.addColumn(\"age_3y_later\") { it[\"age\"] + 3 }" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	full_name
Max	Doe	23	55	Max Doe
Franz	Smith	23	88	Franz Smith
Horst	Keanes	12	82	Horst Keanes

" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.\n", "val newDF = df.addColumn(\"full_name\") { it[\"first_name\"] + \" \" + it[\"last_name\"] }\n", "newDF" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	user_id
Max	Doe	23	55	Doe_id1
Franz	Smith	23	88	Smith_id2
Horst	Keanes	12	82	Keanes_id3

" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Also feel free to mix types here since krangl overloads arithmetic operators like + for dataframe-columns\n", "df.addColumn(\"user_id\") { it[\"last_name\"] + \"_id\" + rowNumber }" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	with_anz
Max	Doe	23	55	false
Franz	Smith	23	88	true
Horst	Keanes	12	82	false

" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Create new attributes with string operations like matching, splitting or extraction.\n", "df.addColumn(\"with_anz\") { it[\"first_name\"].asStrings().map { it!!.contains(\"anz\") } }" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	first_name_initial
Max	Doe	23	55	M
Franz	Smith	23	88	F
Horst	Keanes	12	82	H

" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits\n", "df.addColumn(\"first_name_initial\") { it[\"first_name\"].map{ it.first() } }" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight	age_plus3	initials
Max	Doe	23	55	26	M D
Franz	Smith	23	88	26	F S
Horst	Keanes	12	82	15	H K

" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// or add multiple columns at once\n", "df.addColumns(\n", " \"age_plus3\" to { it[\"age\"] + 3 },\n", " \"initials\" to { it[\"first_name\"].map { it.first() } concat it[\"last_name\"].map { it.first() } }\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Horst	Keanes	12	82
Max	Doe	23	55
Franz	Smith	23	88

" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Sort your data with sortedBy\n", "df.sortedBy(\"age\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Horst	Keanes	12	82
Max	Doe	23	55
Franz	Smith	23	88

" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// and add secondary sorting attributes as varargs\n", "df.sortedBy(\"age\", \"weight\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Franz	Smith	23	88
Max	Doe	23	55
Horst	Keanes	12	82

" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sortedByDescending(\"age\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55
Horst	Keanes	12	82
Franz	Smith	23	88

" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sortedBy { it[\"weight\"].asInts() }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

age	weight
23	55
23	88
12	82

" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Subset columns with select\n", "df.selectIf { it is IntCol } // functional style column selection" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

last_name	weight
Doe	55
Smith	88
Keanes	82

" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.select(\"last_name\", \"weight\") // positive selection" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name
Max	Doe
Franz	Smith
Horst	Keanes

" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.remove(\"weight\", \"age\") // negative selection" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name
Max	Doe
Franz	Smith
Horst	Keanes

" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.select({ endsWith(\"name\") }) // selector mini-language" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88

" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Subset rows with vectorized filter\n", "df.filter { it[\"age\"] eq 23 }" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Franz	Smith	23	88
Horst	Keanes	12	82

" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter { it[\"weight\"] gt 60 }" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55

" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filter({ it[\"last_name\"].isMatching { startsWith(\"Do\") }})" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88

" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// In case vectorized operations are not possible or available we can also filter tables by row\n", "// which allows for scalar operators\n", "df.filterByRow { it[\"age\"] as Int > 20 }" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Horst	Keanes	12	82

" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.filterByRow { (it[\"age\"] as Int).rem(10) == 2 }" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summarize" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

age	last_name	n
23	Doe	1
23	Smith	1
12	Keanes	1

" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// do simple cross tabulations\n", "df.count(\"age\", \"last_name\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

mean_age
19.333333333333332

" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// ... or calculate single summary statistic\n", "df.summarize(\"mean_age\" to { it[\"age\"].mean(true) })" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

min_age	max_age
12.0	23.0

" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// ... or multiple summary statistics\n", "df.summarize(\n", " \"min_age\" to { it[\"age\"].min() },\n", " \"max_age\" to { it[\"age\"].max() }\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

min_age	max_age
12.0	23.0

" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// for sake of r and python adoptability you can also use `=` here\n", "df.summarize(\n", " \"min_age\" `=` { it[\"age\"].min() },\n", " \"max_age\" `=` { it[\"age\"].max() }\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Group" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

age	mean_weight	num_persons
23	71.5	2
12	82.0	1

" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n", "\n", "// Grouped operations\n", "val groupedDf: DataFrame = df.groupBy(\"age\") // or provide multiple grouping attributes with varargs\n", "\n", "val sumDF = groupedDf.summarize(\n", " \"mean_weight\" to { it[\"weight\"].mean(removeNA = true) },\n", " \"num_persons\" to { nrow }\n", ")\n", "sumDF" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

first_name	last_name	age	weight
Max	Doe	23	55
Franz	Smith	23	88
Horst	Keanes	12	82

" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Optionally ungroup the data\n", "groupedDf.ungroup()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Generate object bindings for kotlin.\n", "data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)\n", "val records = sumDF.rowsAs()\n", "\n", "records" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "71.5" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Now we can use the krangl result table in a strongly typed way\n", "records.first().mean_weight" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

age	mean_weight	num_persons
23	71.5	2
12	82.0	1

" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// Vice versa we can also convert an existing set of objects into\n", "records.asDataFrame()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

age	weight
23	71.5
12	82.0

" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "// to populate a data-frame with selected properties only, we can do\n", "records.deparseRecords { mapOf(\"age\" to it.age, \"weight\" to it.mean_weight) }" ] } ], "metadata": { "kernelspec": { "display_name": "Kotlin", "language": "kotlin", "name": "kotlin" }, "language_info": { "codemirror_mode": "text/x-kotlin", "file_extension": "kt", "name": "kotlin" } }, "nbformat": 4, "nbformat_minor": 2 }