{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Päivitetty 2025-02-26 14:20:59.097317\n" ] } ], "source": [ "from datetime import datetime\n", "print(f'Päivitetty {datetime.now()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# K-means" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Katsotaan osaako K-means klusterointi jakaa kurjenmiekat kolmeen lajiin (setosa, versicolor, virginica) \n", "terä- (petal) ja verholehtien (sepal) koon mukaan." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# OMP_NUM_THREADS has to be in the environment before scikit-learn is imported:\n", "# assigning it in a later cell, once scikit-learn is already loaded, does not\n", "# silence the KMeans memory-leak warning on Windows with MKL.\n", "import os\n", "\n", "os.environ['OMP_NUM_THREADS'] = '1'\n", "\n", "import pandas as pd\n", "import seaborn as sns\n", "from sklearn.cluster import KMeans" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
..................
1456.73.05.22.3virginica
1466.32.55.01.9virginica
1476.53.05.22.0virginica
1486.23.45.42.3virginica
1495.93.05.11.8virginica
\n", "

150 rows × 5 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa\n", ".. ... ... ... ... ...\n", "145 6.7 3.0 5.2 2.3 virginica\n", "146 6.3 2.5 5.0 1.9 virginica\n", "147 6.5 3.0 5.2 2.0 virginica\n", "148 6.2 3.4 5.4 2.3 virginica\n", "149 5.9 3.0 5.1 1.8 virginica\n", "\n", "[150 rows x 5 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The example dataset is available through the seaborn library\n", "iris = sns.load_dataset('iris')\n", "iris" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# X holds only the petal and sepal lengths and widths.\n", "# 'prediction' is dropped as well (when present) so that re-running this cell after\n", "# the cluster numbers have been added to iris does not leak them into the features.\n", "X = iris.drop(columns=['species', 'prediction'], errors='ignore')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\akita\\miniconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
KMeans(n_clusters=3, random_state=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "KMeans(n_clusters=3, random_state=2)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit the model.\n", "# OMP_NUM_THREADS is intentionally not assigned in this cell: assigning it here, after\n", "# scikit-learn has already been imported, did not silence the Windows/MKL memory-leak\n", "# warning. To take effect the variable must be set before scikit-learn is imported.\n", "kmeans = KMeans(n_clusters=3, random_state=2)\n", "kmeans.fit(X)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspeciesprediction
05.13.51.40.2setosa1
14.93.01.40.2setosa1
24.73.21.30.2setosa1
34.63.11.50.2setosa1
45.03.61.40.2setosa1
.....................
1456.73.05.22.3virginica2
1466.32.55.01.9virginica0
1476.53.05.22.0virginica2
1486.23.45.42.3virginica2
1495.93.05.11.8virginica0
\n", "

150 rows × 6 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species \\\n", "0 5.1 3.5 1.4 0.2 setosa \n", "1 4.9 3.0 1.4 0.2 setosa \n", "2 4.7 3.2 1.3 0.2 setosa \n", "3 4.6 3.1 1.5 0.2 setosa \n", "4 5.0 3.6 1.4 0.2 setosa \n", ".. ... ... ... ... ... \n", "145 6.7 3.0 5.2 2.3 virginica \n", "146 6.3 2.5 5.0 1.9 virginica \n", "147 6.5 3.0 5.2 2.0 virginica \n", "148 6.2 3.4 5.4 2.3 virginica \n", "149 5.9 3.0 5.1 1.8 virginica \n", "\n", " prediction \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", ".. ... \n", "145 2 \n", "146 0 \n", "147 2 \n", "148 2 \n", "149 0 \n", "\n", "[150 rows x 6 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Store the cluster number (0, 1 or 2) of every row in the original iris data\n", "cluster_numbers = kmeans.predict(X)\n", "iris['prediction'] = cluster_numbers\n", "iris" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prediction012
species
setosa0500
versicolor4802
virginica14036
\n", "
" ], "text/plain": [ "prediction 0 1 2\n", "species \n", "setosa 0 50 0\n", "versicolor 48 0 2\n", "virginica 14 0 36" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cross-tabulate to see how well the clusters correspond to the species\n", "pd.crosstab(index=iris['species'], columns=iris['prediction'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
05.9016132.7483874.3935481.433871
15.0060003.4280001.4620000.246000
26.8500003.0736845.7421052.071053
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 5.901613 2.748387 4.393548 1.433871\n", "1 5.006000 3.428000 1.462000 0.246000\n", "2 6.850000 3.073684 5.742105 2.071053" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cluster centres, one row per cluster. The column labels come from the fitted\n", "# model, so they always match the features it was actually trained on.\n", "pd.DataFrame(kmeans.cluster_centers_, columns=kmeans.feature_names_in_)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 4 }