{
"cells": [
{
"cell_type": "code",
"metadata": {
"ExecuteTime": {
"end_time": "2024-08-21T16:32:46.474583Z",
"start_time": "2024-08-21T16:32:39.480718Z"
}
},
"source": "%use krangl@2fcf74dfbbe382f1803d1ab9e4739439e1f5671b",
"outputs": [],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
| Horst | Keanes | 12 | 82 |
"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// in memory\n",
"val df: DataFrame = dataFrameOf(\n",
" \"first_name\", \"last_name\", \"age\", \"weight\")(\n",
" \"Max\", \"Doe\", 23, 55,\n",
" \"Franz\", \"Smith\", 23, 88,\n",
" \"Horst\", \"Keanes\", 12, 82\n",
")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add columns"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | salary_category |
|---|
| Max | Doe | 23 | 55 | 3 |
| Franz | Smith | 23 | 88 | 3 |
| Horst | Keanes | 12 | 82 | 3 |
"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.addColumn(\"salary_category\") { 3 }"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | age_3y_later |
|---|
| Max | Doe | 23 | 55 | 26 |
| Franz | Smith | 23 | 88 | 26 |
| Horst | Keanes | 12 | 82 | 15 |
"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.addColumn(\"age_3y_later\") { it[\"age\"] + 3 }"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | full_name |
|---|
| Max | Doe | 23 | 55 | Max Doe |
| Franz | Smith | 23 | 88 | Franz Smith |
| Horst | Keanes | 12 | 82 | Horst Keanes |
"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.\n",
"val newDF = df.addColumn(\"full_name\") { it[\"first_name\"] + \" \" + it[\"last_name\"] }\n",
"newDF"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | user_id |
|---|
| Max | Doe | 23 | 55 | Doe_id1 |
| Franz | Smith | 23 | 88 | Smith_id2 |
| Horst | Keanes | 12 | 82 | Keanes_id3 |
"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Also feel free to mix types here since krangl overloads arithmetic operators like + for dataframe-columns\n",
"df.addColumn(\"user_id\") { it[\"last_name\"] + \"_id\" + rowNumber }"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | with_anz |
|---|
| Max | Doe | 23 | 55 | false |
| Franz | Smith | 23 | 88 | true |
| Horst | Keanes | 12 | 82 | false |
"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Create new attributes with string operations like matching, splitting or extraction.\n",
"df.addColumn(\"with_anz\") { it[\"first_name\"].asStrings().map { it!!.contains(\"anz\") } }"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | first_name_initial |
|---|
| Max | Doe | 23 | 55 | M |
| Franz | Smith | 23 | 88 | F |
| Horst | Keanes | 12 | 82 | H |
"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits\n",
"df.addColumn(\"first_name_initial\") { it[\"first_name\"].map{ it.first() } }"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight | age_plus3 | initials |
|---|
| Max | Doe | 23 | 55 | 26 | M D |
| Franz | Smith | 23 | 88 | 26 | F S |
| Horst | Keanes | 12 | 82 | 15 | H K |
"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
"\n",
"// or add multiple columns at once\n",
"df.addColumns(\n",
" \"age_plus3\" to { it[\"age\"] + 3 },\n",
" \"initials\" to { it[\"first_name\"].map { it.first() } concat it[\"last_name\"].map { it.first() } }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sort"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Horst | Keanes | 12 | 82 |
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sort your data with sortedBy\n",
"df.sortedBy(\"age\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Horst | Keanes | 12 | 82 |
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// and add secondary sorting attributes as varargs\n",
"df.sortedBy(\"age\", \"weight\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Franz | Smith | 23 | 88 |
| Max | Doe | 23 | 55 |
| Horst | Keanes | 12 | 82 |
"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sortedByDescending(\"age\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
| Horst | Keanes | 12 | 82 |
| Franz | Smith | 23 | 88 |
"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sortedBy { it[\"weight\"].asInts() }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Select"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Subset columns with select\n",
"df.selectIf { it is IntCol } // functional style column selection"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| last_name | weight |
|---|
| Doe | 55 |
| Smith | 88 |
| Keanes | 82 |
"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.select(\"last_name\", \"weight\") // positive selection"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name |
|---|
| Max | Doe |
| Franz | Smith |
| Horst | Keanes |
"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.remove(\"weight\", \"age\") // negative selection"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name |
|---|
| Max | Doe |
| Franz | Smith |
| Horst | Keanes |
"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.select({ endsWith(\"name\") }) // selector mini-language"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Subset rows with vectorized filter\n",
"df.filter { it[\"age\"] eq 23 }"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Franz | Smith | 23 | 88 |
| Horst | Keanes | 12 | 82 |
"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.filter { it[\"weight\"] gt 60 }"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.filter({ it[\"last_name\"].isMatching { startsWith(\"Do\") }})"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// In case vectorized operations are not possible or available we can also filter tables by row\n",
"// which allows for scalar operators\n",
"df.filterByRow { it[\"age\"] as Int > 20 }"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Horst | Keanes | 12 | 82 |
"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.filterByRow { (it[\"age\"] as Int).rem(10) == 2 }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Summarize"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| age | last_name | n |
|---|
| 23 | Doe | 1 |
| 23 | Smith | 1 |
| 12 | Keanes | 1 |
"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// do simple cross tabulations\n",
"df.count(\"age\", \"last_name\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| mean_age |
|---|
| 19.333333333333332 |
"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
"\n",
"// ... or calculate single summary statistic\n",
"df.summarize(\"mean_age\" to { it[\"age\"].mean(true) })"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
"\n",
"// ... or multiple summary statistics\n",
"df.summarize(\n",
" \"min_age\" to { it[\"age\"].min() },\n",
" \"max_age\" to { it[\"age\"].max() }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// for sake of r and python adoptability you can also use `=` here\n",
"df.summarize(\n",
" \"min_age\" `=` { it[\"age\"].min() },\n",
" \"max_age\" `=` { it[\"age\"].max() }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Group"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| age | mean_weight | num_persons |
|---|
| 23 | 71.5 | 2 |
| 12 | 82.0 | 1 |
"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
"\n",
"// Grouped operations\n",
"val groupedDf: DataFrame = df.groupBy(\"age\") // or provide multiple grouping attributes with varargs\n",
"\n",
"val sumDF = groupedDf.summarize(\n",
" \"mean_weight\" to { it[\"weight\"].mean(removeNA = true) },\n",
" \"num_persons\" to { nrow }\n",
")\n",
"sumDF"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| first_name | last_name | age | weight |
|---|
| Max | Doe | 23 | 55 |
| Franz | Smith | 23 | 88 |
| Horst | Keanes | 12 | 82 |
"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Optionally ungroup the data\n",
"groupedDf.ungroup()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Generate object bindings for kotlin.\n",
"data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)\n",
"val records = sumDF.rowsAs()\n",
"\n",
"records"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"71.5"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Now we can use the krangl result table in a strongly typed way\n",
"records.first().mean_weight"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"| age | mean_weight | num_persons |
|---|
| 23 | 71.5 | 2 |
| 12 | 82.0 | 1 |
"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Vice versa we can also convert an existing set of objects into\n",
"records.asDataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// to populate a data-frame with selected properties only, we can do\n",
"records.deparseRecords { mapOf(\"age\" to it.age, \"weight\" to it.mean_weight) }"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Kotlin",
"language": "kotlin",
"name": "kotlin"
},
"language_info": {
"codemirror_mode": "text/x-kotlin",
"file_extension": "kt",
"name": "kotlin"
}
},
"nbformat": 4,
"nbformat_minor": 2
}