{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clustering - K-means\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading required package: daltoolbox\n",
"\n",
"Registered S3 method overwritten by 'quantmod':\n",
" method from\n",
" as.zoo.data.frame zoo \n",
"\n",
"\n",
"Attaching package: ‘daltoolbox’\n",
"\n",
"\n",
"The following object is masked from ‘package:base’:\n",
"\n",
" transform\n",
"\n",
"\n"
]
}
],
"source": [
"# DAL ToolBox\n",
"# version 1.01.727\n",
"\n",
"source(\"https://raw.githubusercontent.com/cefet-rj-dal/daltoolbox/main/jupyter.R\")\n",
"\n",
"#loading DAL\n",
"load_library(\"daltoolbox\") "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# load the built-in iris dataset into the workspace\n",
"data(\"iris\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## General workflow to test clustering methods"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# configure a k-means clusterer with k = 3 (it is fitted in a later cell)\n",
"model <- cluster_kmeans(k = 3)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"clu\n",
" 1 2 3 \n",
"96 21 33 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# build model\n",
"# k-means is initialized randomly, so fix the RNG seed first; without it the\n",
"# cluster labels and sizes can change on every Restart & Run All\n",
"set.seed(1)\n",
"\n",
"# cluster on the first four columns only; iris$Species is kept aside and used\n",
"# later as the external reference for evaluation\n",
"model <- fit(model, iris[, 1:4])\n",
"clu <- cluster(model, iris[, 1:4])\n",
"\n",
"# number of observations assigned to each cluster\n",
"table(clu)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\t- $clusters_entropy
\n",
"\t\t\n",
"A tibble: 3 × 4\n",
"\n",
"\tx | ce | qtd | ceg |
\n",
"\t<fct> | <dbl> | <int> | <dbl> |
\n",
"\n",
"\n",
"\t1 | 0.9987473 | 96 | 0.63919827 |
\n",
"\t2 | 0.7024666 | 21 | 0.09834532 |
\n",
"\t3 | 0.0000000 | 33 | 0.00000000 |
\n",
"\n",
"
\n",
" \n",
"\t- $clustering_entropy
\n",
"\t\t- 0.737543587859802
\n",
"\t- $data_entropy
\n",
"\t\t- 1.58496250072116
\n",
"
\n"
],
"text/latex": [
"\\begin{description}\n",
"\\item[\\$clusters\\_entropy] A tibble: 3 × 4\n",
"\\begin{tabular}{llll}\n",
" x & ce & qtd & ceg\\\\\n",
" & & & \\\\\n",
"\\hline\n",
"\t 1 & 0.9987473 & 96 & 0.63919827\\\\\n",
"\t 2 & 0.7024666 & 21 & 0.09834532\\\\\n",
"\t 3 & 0.0000000 & 33 & 0.00000000\\\\\n",
"\\end{tabular}\n",
"\n",
"\\item[\\$clustering\\_entropy] 0.737543587859802\n",
"\\item[\\$data\\_entropy] 1.58496250072116\n",
"\\end{description}\n"
],
"text/markdown": [
"$clusters_entropy\n",
": \n",
"A tibble: 3 × 4\n",
"\n",
"| x <fct> | ce <dbl> | qtd <int> | ceg <dbl> |\n",
"|---|---|---|---|\n",
"| 1 | 0.9987473 | 96 | 0.63919827 |\n",
"| 2 | 0.7024666 | 21 | 0.09834532 |\n",
"| 3 | 0.0000000 | 33 | 0.00000000 |\n",
"\n",
"\n",
"$clustering_entropy\n",
": 0.737543587859802\n",
"$data_entropy\n",
": 1.58496250072116\n",
"\n",
"\n"
],
"text/plain": [
"$clusters_entropy\n",
"\u001b[90m# A tibble: 3 × 4\u001b[39m\n",
" x ce qtd ceg\n",
" \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m\n",
"\u001b[90m1\u001b[39m 1 0.999 96 0.639 \n",
"\u001b[90m2\u001b[39m 2 0.702 21 0.098\u001b[4m3\u001b[24m\n",
"\u001b[90m3\u001b[39m 3 0 33 0 \n",
"\n",
"$clustering_entropy\n",
"[1] 0.7375436\n",
"\n",
"$data_entropy\n",
"[1] 1.584963\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# evaluate model using external metric\n",
"eval <- evaluate(model, clu, iris$Species)\n",
"eval"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Influence of normalization on clustering"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"clu\n",
" 1 2 3 \n",
"21 96 33 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# fit a min-max normalization on iris and apply it to the same data\n",
"# (transform() here is the daltoolbox function, which masks base::transform)\n",
"scaler <- fit(minmax(), iris)\n",
"iris_minmax <- transform(scaler, iris)\n",
"\n",
"# k-means is initialized randomly, so fix the RNG seed to keep this run\n",
"# reproducible across Restart & Run All\n",
"set.seed(1)\n",
"\n",
"# refit and cluster on the four normalized measurement columns\n",
"model <- fit(model, iris_minmax[, 1:4])\n",
"clu <- cluster(model, iris_minmax[, 1:4])\n",
"\n",
"# number of observations assigned to each cluster\n",
"table(clu)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\t- $clusters_entropy
\n",
"\t\t\n",
"A tibble: 3 × 4\n",
"\n",
"\tx | ce | qtd | ceg |
\n",
"\t<fct> | <dbl> | <int> | <dbl> |
\n",
"\n",
"\n",
"\t1 | 0.7024666 | 21 | 0.09834532 |
\n",
"\t2 | 0.9987473 | 96 | 0.63919827 |
\n",
"\t3 | 0.0000000 | 33 | 0.00000000 |
\n",
"\n",
"
\n",
" \n",
"\t- $clustering_entropy
\n",
"\t\t- 0.737543587859802
\n",
"\t- $data_entropy
\n",
"\t\t- 1.58496250072116
\n",
"
\n"
],
"text/latex": [
"\\begin{description}\n",
"\\item[\\$clusters\\_entropy] A tibble: 3 × 4\n",
"\\begin{tabular}{llll}\n",
" x & ce & qtd & ceg\\\\\n",
" & & & \\\\\n",
"\\hline\n",
"\t 1 & 0.7024666 & 21 & 0.09834532\\\\\n",
"\t 2 & 0.9987473 & 96 & 0.63919827\\\\\n",
"\t 3 & 0.0000000 & 33 & 0.00000000\\\\\n",
"\\end{tabular}\n",
"\n",
"\\item[\\$clustering\\_entropy] 0.737543587859802\n",
"\\item[\\$data\\_entropy] 1.58496250072116\n",
"\\end{description}\n"
],
"text/markdown": [
"$clusters_entropy\n",
": \n",
"A tibble: 3 × 4\n",
"\n",
"| x <fct> | ce <dbl> | qtd <int> | ceg <dbl> |\n",
"|---|---|---|---|\n",
"| 1 | 0.7024666 | 21 | 0.09834532 |\n",
"| 2 | 0.9987473 | 96 | 0.63919827 |\n",
"| 3 | 0.0000000 | 33 | 0.00000000 |\n",
"\n",
"\n",
"$clustering_entropy\n",
": 0.737543587859802\n",
"$data_entropy\n",
": 1.58496250072116\n",
"\n",
"\n"
],
"text/plain": [
"$clusters_entropy\n",
"\u001b[90m# A tibble: 3 × 4\u001b[39m\n",
" x ce qtd ceg\n",
" \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m\n",
"\u001b[90m1\u001b[39m 1 0.702 21 0.098\u001b[4m3\u001b[24m\n",
"\u001b[90m2\u001b[39m 2 0.999 96 0.639 \n",
"\u001b[90m3\u001b[39m 3 0 33 0 \n",
"\n",
"$clustering_entropy\n",
"[1] 0.7375436\n",
"\n",
"$data_entropy\n",
"[1] 1.584963\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# evaluate model using external metric\n",
"eval <- evaluate(model, clu, iris_minmax$Species)\n",
"eval"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "4.3.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}