{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Päivitetty 2022-10-15 12:17:05.441070\n" ] } ], "source": [ "from datetime import datetime\n", "print(f'Päivitetty {datetime.now()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# K-means" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Katsotaan, osaako K-means-klusterointi jakaa kurjenmiekat kolmeen lajiin (setosa, versicolor, virginica) \n", "terä- (petal) ja verholehtien (sepal) koon mukaan." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "\n", "from sklearn.cluster import KMeans" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
..................
1456.73.05.22.3virginica
1466.32.55.01.9virginica
1476.53.05.22.0virginica
1486.23.45.42.3virginica
1495.93.05.11.8virginica
\n", "

150 rows × 5 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa\n", ".. ... ... ... ... ...\n", "145 6.7 3.0 5.2 2.3 virginica\n", "146 6.3 2.5 5.0 1.9 virginica\n", "147 6.5 3.0 5.2 2.0 virginica\n", "148 6.2 3.4 5.4 2.3 virginica\n", "149 5.9 3.0 5.1 1.8 virginica\n", "\n", "[150 rows x 5 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The example dataset is loaded through the seaborn library\n", "iris = sns.load_dataset('iris')\n", "iris" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# X holds only the petal and sepal lengths and widths.\n", "# 'prediction' is dropped as well (errors='ignore' makes that a no-op on a\n", "# fresh run) so that re-running this cell after the cluster labels have been\n", "# added to iris does not leak them into the feature matrix.\n", "X = iris.drop(columns=['species', 'prediction'], errors='ignore')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KMeans(n_clusters=3, random_state=2)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the model\n", "\n", "# n_init is pinned explicitly because its default changed from 10 to 'auto'\n", "# in newer scikit-learn releases; random_state keeps the run repeatable.\n", "kmeans = KMeans(n_clusters=3, n_init=10, random_state=2)\n", "kmeans.fit(X)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspeciesprediction
05.13.51.40.2setosa0
14.93.01.40.2setosa0
24.73.21.30.2setosa0
34.63.11.50.2setosa0
45.03.61.40.2setosa0
.....................
1456.73.05.22.3virginica2
1466.32.55.01.9virginica1
1476.53.05.22.0virginica2
1486.23.45.42.3virginica2
1495.93.05.11.8virginica1
\n", "

150 rows × 6 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species \\\n", "0 5.1 3.5 1.4 0.2 setosa \n", "1 4.9 3.0 1.4 0.2 setosa \n", "2 4.7 3.2 1.3 0.2 setosa \n", "3 4.6 3.1 1.5 0.2 setosa \n", "4 5.0 3.6 1.4 0.2 setosa \n", ".. ... ... ... ... ... \n", "145 6.7 3.0 5.2 2.3 virginica \n", "146 6.3 2.5 5.0 1.9 virginica \n", "147 6.5 3.0 5.2 2.0 virginica \n", "148 6.2 3.4 5.4 2.3 virginica \n", "149 5.9 3.0 5.1 1.8 virginica \n", "\n", " prediction \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", ".. ... \n", "145 2 \n", "146 1 \n", "147 2 \n", "148 2 \n", "149 1 \n", "\n", "[150 rows x 6 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add the cluster numbers (0, 1 and 2) to the original iris data\n", "cluster_ids = kmeans.predict(X)\n", "iris['prediction'] = cluster_ids\n", "iris" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prediction012
species
setosa5000
versicolor0482
virginica01436
\n", "
" ], "text/plain": [ "prediction 0 1 2\n", "species \n", "setosa 50 0 0\n", "versicolor 0 48 2\n", "virginica 0 14 36" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cross-tabulate species against cluster number to see how well the clusters match the species\n", "pd.crosstab(index=iris['species'], columns=iris['prediction'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
05.0060003.4280001.4620000.246000
15.9016132.7483874.3935481.433871
26.8500003.0736845.7421052.071053
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 5.006000 3.428000 1.462000 0.246000\n", "1 5.901613 2.748387 4.393548 1.433871\n", "2 6.850000 3.073684 5.742105 2.071053" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cluster centres, one row per cluster. The column labels are taken from X\n", "# (the frame the model was fitted on) instead of being repeated by hand.\n", "pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 2 }