{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Päivitetty 2025-02-26 14:20:59.097317\n"
]
}
],
"source": [
"from datetime import datetime\n",
"\n",
"# Record when the notebook was last executed and show it to the reader.\n",
"run_time = datetime.now()\n",
"print(f'Päivitetty {run_time}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# K-means"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Katsotaan, osaako K-means-klusterointi jakaa kurjenmiekat kolmeen lajiin (setosa, versicolor, virginica) \n",
"terä- (petal) ja verholehtien (sepal) koon mukaan."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# OMP_NUM_THREADS has to be set before scikit-learn (and the OpenMP runtime it\n",
"# uses) is loaded. Setting it after the import does not silence the KMeans\n",
"# memory-leak warning that appears on Windows with MKL.\n",
"os.environ['OMP_NUM_THREADS'] = '1'\n",
"\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 145 | \n",
" 6.7 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" | 146 | \n",
" 6.3 | \n",
" 2.5 | \n",
" 5.0 | \n",
" 1.9 | \n",
" virginica | \n",
"
\n",
" \n",
" | 147 | \n",
" 6.5 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.0 | \n",
" virginica | \n",
"
\n",
" \n",
" | 148 | \n",
" 6.2 | \n",
" 3.4 | \n",
" 5.4 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" | 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" virginica | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species\n",
"0 5.1 3.5 1.4 0.2 setosa\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"3 4.6 3.1 1.5 0.2 setosa\n",
"4 5.0 3.6 1.4 0.2 setosa\n",
".. ... ... ... ... ...\n",
"145 6.7 3.0 5.2 2.3 virginica\n",
"146 6.3 2.5 5.0 1.9 virginica\n",
"147 6.5 3.0 5.2 2.0 virginica\n",
"148 6.2 3.4 5.4 2.3 virginica\n",
"149 5.9 3.0 5.1 1.8 virginica\n",
"\n",
"[150 rows x 5 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Esimerkkiaineisto löytyy seaborn-kirjastosta\n",
"iris = sns.load_dataset('iris')\n",
"iris"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# X holds only the sepal and petal lengths and widths.\n",
"# The columns are selected by name instead of dropping 'species', so re-running\n",
"# this cell after a 'prediction' column has been added to iris does not leak\n",
"# the cluster labels into the feature matrix.\n",
"FEATURE_COLUMNS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n",
"X = iris[FEATURE_COLUMNS]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\akita\\miniconda3\\Lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1419: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"KMeans(n_clusters=3, random_state=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"KMeans(n_clusters=3, random_state=2)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the model\n",
"\n",
"# Relates to the warning shown below on Windows machines.\n",
"# NOTE(review): the recorded output still shows the warning, so this variable\n",
"# probably has to be set before scikit-learn is imported; confirm.\n",
"os.environ['OMP_NUM_THREADS'] = '1'\n",
"\n",
"# Three clusters (one per expected species); fixed seed for reproducible results\n",
"kmeans = KMeans(n_clusters=3, random_state=2).fit(X)\n",
"kmeans"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
" prediction | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 1 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 145 | \n",
" 6.7 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.3 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" | 146 | \n",
" 6.3 | \n",
" 2.5 | \n",
" 5.0 | \n",
" 1.9 | \n",
" virginica | \n",
" 0 | \n",
"
\n",
" \n",
" | 147 | \n",
" 6.5 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.0 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" | 148 | \n",
" 6.2 | \n",
" 3.4 | \n",
" 5.4 | \n",
" 2.3 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" | 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" virginica | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 6 columns
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species \\\n",
"0 5.1 3.5 1.4 0.2 setosa \n",
"1 4.9 3.0 1.4 0.2 setosa \n",
"2 4.7 3.2 1.3 0.2 setosa \n",
"3 4.6 3.1 1.5 0.2 setosa \n",
"4 5.0 3.6 1.4 0.2 setosa \n",
".. ... ... ... ... ... \n",
"145 6.7 3.0 5.2 2.3 virginica \n",
"146 6.3 2.5 5.0 1.9 virginica \n",
"147 6.5 3.0 5.2 2.0 virginica \n",
"148 6.2 3.4 5.4 2.3 virginica \n",
"149 5.9 3.0 5.1 1.8 virginica \n",
"\n",
" prediction \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
".. ... \n",
"145 2 \n",
"146 0 \n",
"147 2 \n",
"148 2 \n",
"149 0 \n",
"\n",
"[150 rows x 6 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add the cluster numbers (0, 1 and 2) to the original iris data\n",
"iris = iris.assign(prediction=kmeans.predict(X))\n",
"iris"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | prediction | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | species | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | setosa | \n",
" 0 | \n",
" 50 | \n",
" 0 | \n",
"
\n",
" \n",
" | versicolor | \n",
" 48 | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | virginica | \n",
" 14 | \n",
" 0 | \n",
" 36 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"prediction 0 1 2\n",
"species \n",
"setosa 0 50 0\n",
"versicolor 48 0 2\n",
"virginica 14 0 36"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-tabulate species against cluster number to see how well the clusters match the species\n",
"pd.crosstab(index=iris['species'], columns=iris['prediction'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 5.901613 | \n",
" 2.748387 | \n",
" 4.393548 | \n",
" 1.433871 | \n",
"
\n",
" \n",
" | 1 | \n",
" 5.006000 | \n",
" 3.428000 | \n",
" 1.462000 | \n",
" 0.246000 | \n",
"
\n",
" \n",
" | 2 | \n",
" 6.850000 | \n",
" 3.073684 | \n",
" 5.742105 | \n",
" 2.071053 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width\n",
"0 5.901613 2.748387 4.393548 1.433871\n",
"1 5.006000 3.428000 1.462000 0.246000\n",
"2 6.850000 3.073684 5.742105 2.071053"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cluster centres, one row per cluster. The column labels come from the feature\n",
"# names the model was fitted on, so they cannot drift out of sync with X the way\n",
"# a hand-maintained list of column names could.\n",
"pd.DataFrame(kmeans.cluster_centers_, columns=kmeans.feature_names_in_)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}