{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Päivitetty 2022-10-15 12:17:05.441070\n"
]
}
],
"source": [
"from datetime import datetime\n",
"\n",
"# Print the moment this notebook was last executed\n",
"run_time = datetime.now()\n",
"print(f'Päivitetty {run_time}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# K-means"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Katsotaan, osaako K-means-klusterointi jakaa kurjenmiekat kolmeen lajiin (setosa, versicolor, virginica) \n",
"terä- (petal) ja verholehtien (sepal) koon mukaan."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 145 | \n",
" 6.7 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" 146 | \n",
" 6.3 | \n",
" 2.5 | \n",
" 5.0 | \n",
" 1.9 | \n",
" virginica | \n",
"
\n",
" \n",
" 147 | \n",
" 6.5 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.0 | \n",
" virginica | \n",
"
\n",
" \n",
" 148 | \n",
" 6.2 | \n",
" 3.4 | \n",
" 5.4 | \n",
" 2.3 | \n",
" virginica | \n",
"
\n",
" \n",
" 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" virginica | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 5 columns
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species\n",
"0 5.1 3.5 1.4 0.2 setosa\n",
"1 4.9 3.0 1.4 0.2 setosa\n",
"2 4.7 3.2 1.3 0.2 setosa\n",
"3 4.6 3.1 1.5 0.2 setosa\n",
"4 5.0 3.6 1.4 0.2 setosa\n",
".. ... ... ... ... ...\n",
"145 6.7 3.0 5.2 2.3 virginica\n",
"146 6.3 2.5 5.0 1.9 virginica\n",
"147 6.5 3.0 5.2 2.0 virginica\n",
"148 6.2 3.4 5.4 2.3 virginica\n",
"149 5.9 3.0 5.1 1.8 virginica\n",
"\n",
"[150 rows x 5 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The example dataset is available through seaborn's load_dataset\n",
"iris = sns.load_dataset(name='iris')\n",
"iris"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# X holds only the petal and sepal lengths and widths.\n",
"# A later cell adds a 'prediction' column to iris, so it is dropped here as well\n",
"# (ignored when absent); otherwise re-running this cell after that one would\n",
"# put the cluster labels into the feature matrix.\n",
"X = iris.drop(columns='species').drop(columns='prediction', errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"KMeans(n_clusters=3, random_state=2)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the model\n",
"\n",
"# n_init is given explicitly so the clustering does not depend on the installed\n",
"# scikit-learn version: the default was 10 and changed to 'auto' in release 1.4.\n",
"kmeans = KMeans(n_clusters=3, n_init=10, random_state=2)\n",
"kmeans.fit(X)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
" species | \n",
" prediction | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.1 | \n",
" 3.5 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.9 | \n",
" 3.0 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 4.7 | \n",
" 3.2 | \n",
" 1.3 | \n",
" 0.2 | \n",
" setosa | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.6 | \n",
" 3.1 | \n",
" 1.5 | \n",
" 0.2 | \n",
" setosa | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5.0 | \n",
" 3.6 | \n",
" 1.4 | \n",
" 0.2 | \n",
" setosa | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 145 | \n",
" 6.7 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.3 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" 146 | \n",
" 6.3 | \n",
" 2.5 | \n",
" 5.0 | \n",
" 1.9 | \n",
" virginica | \n",
" 1 | \n",
"
\n",
" \n",
" 147 | \n",
" 6.5 | \n",
" 3.0 | \n",
" 5.2 | \n",
" 2.0 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" 148 | \n",
" 6.2 | \n",
" 3.4 | \n",
" 5.4 | \n",
" 2.3 | \n",
" virginica | \n",
" 2 | \n",
"
\n",
" \n",
" 149 | \n",
" 5.9 | \n",
" 3.0 | \n",
" 5.1 | \n",
" 1.8 | \n",
" virginica | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
150 rows × 6 columns
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width species \\\n",
"0 5.1 3.5 1.4 0.2 setosa \n",
"1 4.9 3.0 1.4 0.2 setosa \n",
"2 4.7 3.2 1.3 0.2 setosa \n",
"3 4.6 3.1 1.5 0.2 setosa \n",
"4 5.0 3.6 1.4 0.2 setosa \n",
".. ... ... ... ... ... \n",
"145 6.7 3.0 5.2 2.3 virginica \n",
"146 6.3 2.5 5.0 1.9 virginica \n",
"147 6.5 3.0 5.2 2.0 virginica \n",
"148 6.2 3.4 5.4 2.3 virginica \n",
"149 5.9 3.0 5.1 1.8 virginica \n",
"\n",
" prediction \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
".. ... \n",
"145 2 \n",
"146 1 \n",
"147 2 \n",
"148 2 \n",
"149 1 \n",
"\n",
"[150 rows x 6 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach each row's cluster number (0, 1 or 2) to the original iris data\n",
"cluster_labels = kmeans.predict(X)\n",
"iris['prediction'] = cluster_labels\n",
"iris"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" prediction | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" species | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" setosa | \n",
" 50 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" versicolor | \n",
" 0 | \n",
" 48 | \n",
" 2 | \n",
"
\n",
" \n",
" virginica | \n",
" 0 | \n",
" 14 | \n",
" 36 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"prediction 0 1 2\n",
"species \n",
"setosa 50 0 0\n",
"versicolor 0 48 2\n",
"virginica 0 14 36"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-tabulate species against cluster number to see how well the clusters match the species\n",
"pd.crosstab(index=iris['species'], columns=iris['prediction'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sepal_length | \n",
" sepal_width | \n",
" petal_length | \n",
" petal_width | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 5.006000 | \n",
" 3.428000 | \n",
" 1.462000 | \n",
" 0.246000 | \n",
"
\n",
" \n",
" 1 | \n",
" 5.901613 | \n",
" 2.748387 | \n",
" 4.393548 | \n",
" 1.433871 | \n",
"
\n",
" \n",
" 2 | \n",
" 6.850000 | \n",
" 3.073684 | \n",
" 5.742105 | \n",
" 2.071053 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sepal_length sepal_width petal_length petal_width\n",
"0 5.006000 3.428000 1.462000 0.246000\n",
"1 5.901613 2.748387 4.393548 1.433871\n",
"2 6.850000 3.073684 5.742105 2.071053"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cluster centres, one row per cluster. The column labels come from X, the frame\n",
"# the model was fitted on, instead of a hand-maintained copy of the feature names.\n",
"pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}