{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Luokittelu - K-Means Cluster" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Klassinen esimerkki luokittelusta on kurjenmiekkojen (iris) luokittelu kolmeen lajiin (setosa, versicolor, virginica) \n", "terä- (petal) ja verholehtien (sepal) koon mukaan. Seuraavassa kokeilen lajien tunnistamista ilman opetusdataa.\n", "\n", "

### K-Means Cluster -menetelmän idea

\n", "\n", "Menetelmän tarkoituksena on löytää datasta K-kappaletta ryhmiä (klustereita, segmenttejä). Ryhmät muodostetaan ryhmäkeskusten ympärille." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "\n", "#Vaikuttaa kaavioiden ulkoasuun:\n", "sns.set()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The example dataset ships with the seaborn library:\n", "iris = sns.load_dataset('iris')\n", "iris.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# The feature matrix is the iris data without the species column:\n", "X = iris.drop('species', axis=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[6.85 , 3.07368421, 5.74210526, 2.07105263],\n", " [5.006 , 3.428 , 1.462 , 0.246 ],\n", " [5.9016129 , 2.7483871 , 4.39354839, 1.43387097]])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Import the K-Means clustering model:\n", "from sklearn.cluster import KMeans\n", "\n", "# Fit the model. random_state fixes the centroid initialisation so the\n", "# (otherwise arbitrary) cluster numbering 0/1/2 is reproducible between runs;\n", "# n_init=10 pins the number of restarts instead of relying on the library default:\n", "malli = KMeans(n_clusters=3, n_init=10, random_state=0)\n", "malli.fit(X)\n", "\n", "# Cluster centres (sepal_length, sepal_width, petal_length, petal_width):\n", "malli.cluster_centers_" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col_0lkm
K
038
150
262
\n", "
" ], "text/plain": [ "col_0 lkm\n", "K \n", "0 38\n", "1 50\n", "2 62" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cluster membership of each observation. labels_ holds the assignments made by\n", "# fit(), so re-running this cell never feeds the added K column back into the model:\n", "X['K'] = malli.labels_\n", "pd.crosstab(X['K'], 'lkm')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
K012
sepal_lengthcount38.00000050.00000062.000000
mean6.8500005.0060005.901613
std0.4941550.3524900.466410
min6.1000004.3000004.900000
25%6.4250004.8000005.600000
50%6.7000005.0000005.900000
75%7.2000005.2000006.200000
max7.9000005.8000007.000000
sepal_widthcount38.00000050.00000062.000000
mean3.0736843.4280002.748387
std0.2900920.3790640.296284
min2.5000002.3000002.000000
25%2.9250003.2000002.500000
50%3.0000003.4000002.800000
75%3.2000003.6750003.000000
max3.8000004.4000003.400000
petal_lengthcount38.00000050.00000062.000000
mean5.7421051.4620004.393548
std0.4885900.1736640.508895
min4.9000001.0000003.000000
25%5.4250001.4000004.025000
50%5.6500001.5000004.500000
75%6.0000001.5750004.800000
max6.9000001.9000005.100000
petal_widthcount38.00000050.00000062.000000
mean2.0710530.2460001.433871
std0.2798720.1053860.297500
min1.4000000.1000001.000000
25%1.8250000.2000001.300000
50%2.1000000.2000001.400000
75%2.3000000.3000001.575000
max2.5000000.6000002.400000
\n", "
" ], "text/plain": [ "K 0 1 2\n", "sepal_length count 38.000000 50.000000 62.000000\n", " mean 6.850000 5.006000 5.901613\n", " std 0.494155 0.352490 0.466410\n", " min 6.100000 4.300000 4.900000\n", " 25% 6.425000 4.800000 5.600000\n", " 50% 6.700000 5.000000 5.900000\n", " 75% 7.200000 5.200000 6.200000\n", " max 7.900000 5.800000 7.000000\n", "sepal_width count 38.000000 50.000000 62.000000\n", " mean 3.073684 3.428000 2.748387\n", " std 0.290092 0.379064 0.296284\n", " min 2.500000 2.300000 2.000000\n", " 25% 2.925000 3.200000 2.500000\n", " 50% 3.000000 3.400000 2.800000\n", " 75% 3.200000 3.675000 3.000000\n", " max 3.800000 4.400000 3.400000\n", "petal_length count 38.000000 50.000000 62.000000\n", " mean 5.742105 1.462000 4.393548\n", " std 0.488590 0.173664 0.508895\n", " min 4.900000 1.000000 3.000000\n", " 25% 5.425000 1.400000 4.025000\n", " 50% 5.650000 1.500000 4.500000\n", " 75% 6.000000 1.575000 4.800000\n", " max 6.900000 1.900000 5.100000\n", "petal_width count 38.000000 50.000000 62.000000\n", " mean 2.071053 0.246000 1.433871\n", " std 0.279872 0.105386 0.297500\n", " min 1.400000 0.100000 1.000000\n", " 25% 1.825000 0.200000 1.300000\n", " 50% 2.100000 0.200000 1.400000\n", " 75% 2.300000 0.300000 1.575000\n", " max 2.500000 0.600000 2.400000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Tunnuslukuja ryhmittäin:\n", "X.groupby('K').describe().T" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speciessetosaversicolorvirginica
K
00236
15000
204814
\n", "