{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## NA and Outlier analysis" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading required package: daltoolbox\n", "\n", "Registered S3 method overwritten by 'quantmod':\n", " method from\n", " as.zoo.data.frame zoo \n", "\n", "\n", "Attaching package: ‘daltoolbox’\n", "\n", "\n", "The following object is masked from ‘package:base’:\n", "\n", " transform\n", "\n", "\n" ] } ], "source": [ "# DAL ToolBox\n", "# version 1.01.727\n", "\n", "source(\"https://raw.githubusercontent.com/cefet-rj-dal/daltoolbox/main/jupyter.R\")\n", "\n", "#loading DAL\n", "load_library(\"daltoolbox\") " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Outlier removal\n", "The following class uses the box-plot definition of outliers.\n", "\n", "An outlier is a value that is lower than $Q_1 - 1.5 \\cdot IQR$ or higher than $Q_3 + 1.5 \\cdot IQR$.\n", "\n", "The class removes outliers from numeric attributes. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Removing outliers from a data frame" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
A data.frame: 6 × 5
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
<dbl><dbl><dbl><dbl><fct>
15.13.51.40.2setosa
24.93.01.40.2setosa
34.73.21.30.2setosa
44.63.11.50.2setosa
55.03.61.40.2setosa
65.43.91.70.4setosa
\n" ], "text/latex": [ "A data.frame: 6 × 5\n", "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", " & & & & & \\\\\n", "\\hline\n", "\t1 & 5.1 & 3.5 & 1.4 & 0.2 & setosa\\\\\n", "\t2 & 4.9 & 3.0 & 1.4 & 0.2 & setosa\\\\\n", "\t3 & 4.7 & 3.2 & 1.3 & 0.2 & setosa\\\\\n", "\t4 & 4.6 & 3.1 & 1.5 & 0.2 & setosa\\\\\n", "\t5 & 5.0 & 3.6 & 1.4 & 0.2 & setosa\\\\\n", "\t6 & 5.4 & 3.9 & 1.7 & 0.4 & setosa\\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "A data.frame: 6 × 5\n", "\n", "| | Sepal.Length <dbl> | Sepal.Width <dbl> | Petal.Length <dbl> | Petal.Width <dbl> | Species <fct> |\n", "|---|---|---|---|---|---|\n", "| 1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |\n", "| 2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |\n", "| 3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |\n", "| 4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |\n", "| 5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |\n", "| 6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |\n", "\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n", "1 5.1 3.5 1.4 0.2 setosa \n", "2 4.9 3.0 1.4 0.2 setosa \n", "3 4.7 3.2 1.3 0.2 setosa \n", "4 4.6 3.1 1.5 0.2 setosa \n", "5 5.0 3.6 1.4 0.2 setosa \n", "6 5.4 3.9 1.7 0.4 setosa " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "146" ], "text/latex": [ "146" ], "text/markdown": [ "146" ], "text/plain": [ "[1] 146" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# code for outlier removal\n", "out_obj <- outliers() # class for outlier analysis\n", "out_obj <- fit(out_obj, iris) # computing boundaries\n", "iris.clean <- transform(out_obj, iris) # returning cleaned dataset\n", "\n", "# inspection of cleaned dataset\n", "head(iris.clean)\n", "nrow(iris.clean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Visualizing the actual outliers" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "idx\n", 
"FALSE TRUE \n", " 146 4 \n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "\n", "\t\n", "\t\n", "\t\n", "\t\n", "\n", "
A data.frame: 4 × 5
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
<dbl><dbl><dbl><dbl><fct>
165.74.41.50.4setosa
335.24.11.50.1setosa
345.54.21.40.2setosa
615.02.03.51.0versicolor
\n" ], "text/latex": [ "A data.frame: 4 × 5\n", "\\begin{tabular}{r|lllll}\n", " & Sepal.Length & Sepal.Width & Petal.Length & Petal.Width & Species\\\\\n", " & & & & & \\\\\n", "\\hline\n", "\t16 & 5.7 & 4.4 & 1.5 & 0.4 & setosa \\\\\n", "\t33 & 5.2 & 4.1 & 1.5 & 0.1 & setosa \\\\\n", "\t34 & 5.5 & 4.2 & 1.4 & 0.2 & setosa \\\\\n", "\t61 & 5.0 & 2.0 & 3.5 & 1.0 & versicolor\\\\\n", "\\end{tabular}\n" ], "text/markdown": [ "\n", "A data.frame: 4 × 5\n", "\n", "| | Sepal.Length <dbl> | Sepal.Width <dbl> | Petal.Length <dbl> | Petal.Width <dbl> | Species <fct> |\n", "|---|---|---|---|---|---|\n", "| 16 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |\n", "| 33 | 5.2 | 4.1 | 1.5 | 0.1 | setosa |\n", "| 34 | 5.5 | 4.2 | 1.4 | 0.2 | setosa |\n", "| 61 | 5.0 | 2.0 | 3.5 | 1.0 | versicolor |\n", "\n" ], "text/plain": [ " Sepal.Length Sepal.Width Petal.Length Petal.Width Species \n", "16 5.7 4.4 1.5 0.4 setosa \n", "33 5.2 4.1 1.5 0.1 setosa \n", "34 5.5 4.2 1.4 0.2 setosa \n", "61 5.0 2.0 3.5 1.0 versicolor" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "idx <- attr(iris.clean, \"idx\")\n", "print(table(idx))\n", "iris.outliers <- iris[idx,]\n", "head(iris.outliers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "4.3.3" } }, "nbformat": 4, "nbformat_minor": 4 }