{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Ejercicio Python de K-means\n", "Realizaremos un ejercicio de prueba para comprender cómo funciona este algoritmo." ] }, { "cell_type": "code", "execution_count": 232, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sb\n", "from sklearn.cluster import KMeans\n", "from sklearn.metrics import pairwise_distances_argmin_min\n", "\n", "%matplotlib inline\n", "from mpl_toolkits.mplot3d import Axes3D\n", "plt.rcParams['figure.figsize'] = (16, 9)\n", "plt.style.use('ggplot')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cargamos los datos de entrada del archivo CSV" ] }, { "cell_type": "code", "execution_count": 233, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
usuarioopcoexagnewordcountcategoria
03gerardpique34.29795328.14881941.94881929.3703159.84157537.09457
1aguerosergiokun44.98684220.52586537.93894724.27909810.36240678.79707
2albertochicote41.73385413.74541738.99989634.6455218.83697949.26044
3AlejandroSanz40.37715415.37746252.33753831.0821545.03223180.45382
4alfredocasero136.66467719.64225848.53080631.1388717.30596847.06454
\n", "
" ], "text/plain": [ " usuario op co ex ag ne \\\n", "0 3gerardpique 34.297953 28.148819 41.948819 29.370315 9.841575 \n", "1 aguerosergiokun 44.986842 20.525865 37.938947 24.279098 10.362406 \n", "2 albertochicote 41.733854 13.745417 38.999896 34.645521 8.836979 \n", "3 AlejandroSanz 40.377154 15.377462 52.337538 31.082154 5.032231 \n", "4 alfredocasero1 36.664677 19.642258 48.530806 31.138871 7.305968 \n", "\n", " wordcount categoria \n", "0 37.0945 7 \n", "1 78.7970 7 \n", "2 49.2604 4 \n", "3 80.4538 2 \n", "4 47.0645 4 " ] }, "execution_count": 233, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe = pd.read_csv(r\"analisis.csv\")\n", "dataframe.head()" ] }, { "cell_type": "code", "execution_count": 234, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
opcoexagnewordcountcategoria
count140.000000140.000000140.000000140.000000140.000000140.000000140.000000
mean44.41459122.97713540.76442822.9185288.00009898.7154844.050000
std8.4257235.8168517.1852467.6571223.03924844.7140712.658839
min30.0204657.85275618.6935429.3059851.0302135.0208001.000000
25%38.20648419.74029936.09572217.0509936.08614466.2184752.000000
50%44.50709122.46671841.45749221.3845547.83972294.7114003.500000
75%49.36592326.09160645.19776928.6788679.758189119.7079257.000000
max71.69612949.63786359.82484440.58316223.978462217.1832009.000000
\n", "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0red42
1green33
2blue16
3cyan27
4yellow22
\n", "
" ], "text/plain": [ " color cantidad\n", "0 red 42\n", "1 green 33\n", "2 blue 16\n", "3 cyan 27\n", "4 yellow 22" ] }, "execution_count": 247, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# contamos cuantos usuarios hay en cada grupo\n", "copy = pd.DataFrame()\n", "copy['usuario']=dataframe['usuario'].values\n", "copy['categoria']=dataframe['categoria'].values\n", "copy['label'] = labels;\n", "cantidadGrupo = pd.DataFrame()\n", "cantidadGrupo['color']=colores\n", "cantidadGrupo['cantidad']=copy.groupby('label').size()\n", "cantidadGrupo" ] }, { "cell_type": "code", "execution_count": 271, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "