{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Clustering - Kmeans\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading required package: daltoolbox\n", "\n", "Registered S3 method overwritten by 'quantmod':\n", " method from\n", " as.zoo.data.frame zoo \n", "\n", "\n", "Attaching package: ‘daltoolbox’\n", "\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " transform\n", "\n", "\n" ] } ], "source": [ "# DAL ToolBox\n", "# version 1.01.727\n", "\n", "source(\"https://raw.githubusercontent.com/cefet-rj-dal/daltoolbox/main/jupyter.R\")\n", "\n", "#loading DAL\n", "load_library(\"daltoolbox\") " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#load dataset\n", "data(iris)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fitting and evaluating a k-means clustering model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# setup clustering\n", "model <- cluster_kmeans(k=3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "clu\n", " 1 2 3 \n", "96 21 33 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# build model\n", "model <- fit(model, iris[,1:4])\n", "clu <- cluster(model, iris[,1:4])\n", "table(clu)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\t
$clusters_entropy
\n", "\t\t
\n", "\n", "\n", "\t\n", "\t\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
A tibble: 3 × 4
xceqtdceg
<fct><dbl><int><dbl>
10.9987473960.63919827
20.7024666210.09834532
30.0000000330.00000000
\n", "
\n", "\t
$clustering_entropy
\n", "\t\t
0.737543587859802
\n", "\t
$data_entropy
\n", "\t\t
1.58496250072116
\n", "
\n" ], "text/latex": [ "\\begin{description}\n", "\\item[\\$clusters\\_entropy] A tibble: 3 × 4\n", "\\begin{tabular}{llll}\n", " x & ce & qtd & ceg\\\\\n", " & & & \\\\\n", "\\hline\n", "\t 1 & 0.9987473 & 96 & 0.63919827\\\\\n", "\t 2 & 0.7024666 & 21 & 0.09834532\\\\\n", "\t 3 & 0.0000000 & 33 & 0.00000000\\\\\n", "\\end{tabular}\n", "\n", "\\item[\\$clustering\\_entropy] 0.737543587859802\n", "\\item[\\$data\\_entropy] 1.58496250072116\n", "\\end{description}\n" ], "text/markdown": [ "$clusters_entropy\n", ": \n", "A tibble: 3 × 4\n", "\n", "| x <fct> | ce <dbl> | qtd <int> | ceg <dbl> |\n", "|---|---|---|---|\n", "| 1 | 0.9987473 | 96 | 0.63919827 |\n", "| 2 | 0.7024666 | 21 | 0.09834532 |\n", "| 3 | 0.0000000 | 33 | 0.00000000 |\n", "\n", "\n", "$clustering_entropy\n", ": 0.737543587859802\n", "$data_entropy\n", ": 1.58496250072116\n", "\n", "\n" ], "text/plain": [ "$clusters_entropy\n", "\u001b[90m# A tibble: 3 × 4\u001b[39m\n", " x ce qtd ceg\n", " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m\n", "\u001b[90m1\u001b[39m 1 0.999 96 0.639 \n", "\u001b[90m2\u001b[39m 2 0.702 21 0.098\u001b[4m3\u001b[24m\n", "\u001b[90m3\u001b[39m 3 0 33 0 \n", "\n", "$clustering_entropy\n", "[1] 0.7375436\n", "\n", "$data_entropy\n", "[1] 1.584963\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# evaluate model using external metric\n", "eval <- evaluate(model, clu, iris$Species)\n", "eval" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Influence of normalization in clustering" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "clu\n", " 1 2 3 \n", "21 96 33 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "iris_minmax <- transform(fit(minmax(), iris), iris)\n", "model <- fit(model, iris_minmax[,1:4])\n", "clu <- cluster(model, 
iris_minmax[,1:4])\n", "table(clu)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\t
$clusters_entropy
\n", "\t\t
\n", "\n", "\n", "\t\n", "\t\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\n", "
A tibble: 3 × 4
xceqtdceg
<fct><dbl><int><dbl>
10.7024666210.09834532
20.9987473960.63919827
30.0000000330.00000000
\n", "
\n", "\t
$clustering_entropy
\n", "\t\t
0.737543587859802
\n", "\t
$data_entropy
\n", "\t\t
1.58496250072116
\n", "
\n" ], "text/latex": [ "\\begin{description}\n", "\\item[\\$clusters\\_entropy] A tibble: 3 × 4\n", "\\begin{tabular}{llll}\n", " x & ce & qtd & ceg\\\\\n", " & & & \\\\\n", "\\hline\n", "\t 1 & 0.7024666 & 21 & 0.09834532\\\\\n", "\t 2 & 0.9987473 & 96 & 0.63919827\\\\\n", "\t 3 & 0.0000000 & 33 & 0.00000000\\\\\n", "\\end{tabular}\n", "\n", "\\item[\\$clustering\\_entropy] 0.737543587859802\n", "\\item[\\$data\\_entropy] 1.58496250072116\n", "\\end{description}\n" ], "text/markdown": [ "$clusters_entropy\n", ": \n", "A tibble: 3 × 4\n", "\n", "| x <fct> | ce <dbl> | qtd <int> | ceg <dbl> |\n", "|---|---|---|---|\n", "| 1 | 0.7024666 | 21 | 0.09834532 |\n", "| 2 | 0.9987473 | 96 | 0.63919827 |\n", "| 3 | 0.0000000 | 33 | 0.00000000 |\n", "\n", "\n", "$clustering_entropy\n", ": 0.737543587859802\n", "$data_entropy\n", ": 1.58496250072116\n", "\n", "\n" ], "text/plain": [ "$clusters_entropy\n", "\u001b[90m# A tibble: 3 × 4\u001b[39m\n", " x ce qtd ceg\n", " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m\n", "\u001b[90m1\u001b[39m 1 0.702 21 0.098\u001b[4m3\u001b[24m\n", "\u001b[90m2\u001b[39m 2 0.999 96 0.639 \n", "\u001b[90m3\u001b[39m 3 0 33 0 \n", "\n", "$clustering_entropy\n", "[1] 0.7375436\n", "\n", "$data_entropy\n", "[1] 1.584963\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# evaluate model using external metric\n", "eval <- evaluate(model, clu, iris_minmax$Species)\n", "eval" ] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "4.3.3" } }, "nbformat": 4, "nbformat_minor": 4 }