{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:43:59.510935", "start_time": "2017-11-17T21:43:56.710238" }, "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", " from pandas.core import datetools\n", "/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] }, { "data": { "text/html": [ " \n", "\n", "\n", " \n", "\n", "
\n", " \n", " BokehJS successfully loaded.\n", "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Custom libraries\n", "from datascienceutils import clusteringModels as cm\n", "from datascienceutils import analyze\n", "\n", "# Standard libraries\n", "import json\n", "%matplotlib inline\n", "import datetime\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n", "from bokeh.charts import Histogram\n", "import bokeh\n", "output_notebook()\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:43:59.551395", "start_time": "2017-11-17T21:43:59.513704" }, "scrolled": true }, "outputs": [], "source": [ "irisDf = pd.read_csv('~/DataScientist/data/Iris.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:02.694293", "start_time": "2017-11-17T21:43:59.553380" }, "scrolled": false }, "outputs": [], "source": [ "analyze.dimension_analyze(irisDf, cluster=True, n_clusters=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:03.125544", "start_time": "2017-11-17T21:44:02.696365" }, "scrolled": false }, "outputs": [], "source": [ "analyze.dimension_analyze(irisDf, pca_plot=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:03.144265", "start_time": "2017-11-17T21:44:03.127536" }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmClass
05.13.51.40.2Iris-setosa
14.93.01.40.2Iris-setosa
24.73.21.30.2Iris-setosa
34.63.11.50.2Iris-setosa
45.03.61.40.2Iris-setosa
\n", "
" ], "text/plain": [ " SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Class\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "irisDf.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:35.907724", "start_time": "2017-11-17T21:44:35.068234" }, "scrolled": false }, "outputs": [], "source": [ "# Split the label column off the feature frame: pop() removes 'Class' from irisDf\n", "# in place and returns it, so the cells below receive only the remaining columns.\n", "# The membership guard keeps this cell re-runnable - once 'Class' has been removed,\n", "# running it again is a no-op instead of failing on the missing column.\n", "if 'Class' in irisDf.columns:\n", "    target = irisDf.pop('Class')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cm.cluster_analyze(irisDf)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:37.207660", "start_time": "2017-11-17T21:44:35.909239" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 2 The average silhouette_score is : 0.363119994814\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 4 The average silhouette_score is : 0.753523569925\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 6 The average silhouette_score is : 0.622085988116\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cm.silhouette_analyze(irisDf, cluster_type='KMeans')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:44:37.242001", "start_time": "2017-11-17T21:44:37.209329" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 2 The average silhouette_score is : 0.485842354601\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 4 The average silhouette_score is : 0.607665314687\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 6 The average silhouette_score is : 0.607665314687\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cm.silhouette_analyze(irisDf, cluster_type='dbscan')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:45:22.086017", "start_time": "2017-11-17T21:45:20.096973" }, "scrolled": false }, "outputs": [], "source": [ "cm.silhouette_analyze(irisDf, cluster_type='spectral')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:45:28.331079", "start_time": "2017-11-17T21:45:27.437544" }, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 2 The average silhouette_score is : 0.501699257107\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 4 The average silhouette_score is : 0.57351529012\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "For clusters = 6 The average silhouette_score is : 0.659228815789\n" ] }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cm.silhouette_analyze(irisDf, cluster_type='birch')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2017-11-17T21:45:37.071060", "start_time": "2017-11-17T21:45:36.537628" }, "scrolled": true }, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'ipdb'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msom_analyze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mirisDf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malgo_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'som'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/playspace/data-science-utils/datascienceutils/clusteringModels.py\u001b[0m in \u001b[0;36msom_analyze\u001b[0;34m(dataframe, mapsize, algo_type)\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0msom_analyze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapsize\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malgo_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'som'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0msompy\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0msom_factory\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0msompy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSOMFactory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/playspace/data-science-utils/src/sompy/sompy/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0msompy\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSOMFactory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mvisualization\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/playspace/data-science-utils/src/sompy/sompy/sompy.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;31m#lbugnon\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0msompy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mipdb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'ipdb'" ] } ], "source": [ "cm.som_analyze(irisDf, (10,10), algo_type='som')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 1 }