{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Päivitetty 2022-09-11 12:52:08.931765\n" ] } ], "source": [ "from datetime import datetime\n", "print(f'Päivitetty {datetime.now()}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Luokittelu - kategorisia muuttujia, tasapainotus\n", "\n", "Data löytyy esimerkiksi lähteestä:\n", "https://archive.ics.uci.edu/ml/datasets/bank+marketing\n", "\n", "Kohdemuuttuja y sisältää tiedon siitä, onko asiakkaalla määräaikaistalletuksia.\n", "\n", "Selvitetään, voidaanko asiakastietojen perusteella ennustaa y-muuttujan arvoja.\n", "\n", "Mukana on kategorisia muuttujia, jotka täytyy muuntaa dummy-muuttujiksi." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# Datan tasapainottamiseen\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "\n", "# Datan kaikki sarakkeet näkyviin\n", "pd.options.display.max_columns = None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Datan tarkastelua" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | age | \n", "job | \n", "marital | \n", "education | \n", "default | \n", "housing | \n", "loan | \n", "contact | \n", "month | \n", "day_of_week | \n", "duration | \n", "campaign | \n", "pdays | \n", "previous | \n", "poutcome | \n", "emp_var_rate | \n", "cons_price_idx | \n", "cons_conf_idx | \n", "euribor3m | \n", "nr_employed | \n", "y | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "44 | \n", "blue-collar | \n", "married | \n", "basic.4y | \n", "unknown | \n", "yes | \n", "no | \n", "cellular | \n", "aug | \n", "thu | \n", "210 | \n", "1 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "1.4 | \n", "93.444 | \n", "-36.1 | \n", "4.963 | \n", "5228.1 | \n", "0 | \n", "
1 | \n", "53 | \n", "technician | \n", "married | \n", "unknown | \n", "no | \n", "no | \n", "no | \n", "cellular | \n", "nov | \n", "fri | \n", "138 | \n", "1 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "-0.1 | \n", "93.200 | \n", "-42.0 | \n", "4.021 | \n", "5195.8 | \n", "0 | \n", "
2 | \n", "28 | \n", "management | \n", "single | \n", "university.degree | \n", "no | \n", "yes | \n", "no | \n", "cellular | \n", "jun | \n", "thu | \n", "339 | \n", "3 | \n", "6 | \n", "2 | \n", "success | \n", "-1.7 | \n", "94.055 | \n", "-39.8 | \n", "0.729 | \n", "4991.6 | \n", "1 | \n", "
3 | \n", "39 | \n", "services | \n", "married | \n", "high.school | \n", "no | \n", "no | \n", "no | \n", "cellular | \n", "apr | \n", "fri | \n", "185 | \n", "2 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "-1.8 | \n", "93.075 | \n", "-47.1 | \n", "1.405 | \n", "5099.1 | \n", "0 | \n", "
4 | \n", "55 | \n", "retired | \n", "married | \n", "basic.4y | \n", "no | \n", "yes | \n", "no | \n", "cellular | \n", "aug | \n", "fri | \n", "137 | \n", "1 | \n", "3 | \n", "1 | \n", "success | \n", "-2.9 | \n", "92.201 | \n", "-31.4 | \n", "0.869 | \n", "5076.2 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41183 | \n", "59 | \n", "retired | \n", "married | \n", "high.school | \n", "unknown | \n", "no | \n", "yes | \n", "telephone | \n", "jun | \n", "thu | \n", "222 | \n", "1 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "1.4 | \n", "94.465 | \n", "-41.8 | \n", "4.866 | \n", "5228.1 | \n", "0 | \n", "
41184 | \n", "31 | \n", "housemaid | \n", "married | \n", "basic.4y | \n", "unknown | \n", "no | \n", "no | \n", "telephone | \n", "may | \n", "thu | \n", "196 | \n", "2 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "1.1 | \n", "93.994 | \n", "-36.4 | \n", "4.860 | \n", "5191.0 | \n", "0 | \n", "
41185 | \n", "42 | \n", "admin. | \n", "single | \n", "university.degree | \n", "unknown | \n", "yes | \n", "yes | \n", "telephone | \n", "may | \n", "wed | \n", "62 | \n", "3 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "1.1 | \n", "93.994 | \n", "-36.4 | \n", "4.857 | \n", "5191.0 | \n", "0 | \n", "
41186 | \n", "48 | \n", "technician | \n", "married | \n", "professional.course | \n", "no | \n", "no | \n", "yes | \n", "telephone | \n", "oct | \n", "tue | \n", "200 | \n", "2 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "-3.4 | \n", "92.431 | \n", "-26.9 | \n", "0.742 | \n", "5017.5 | \n", "0 | \n", "
41187 | \n", "25 | \n", "student | \n", "single | \n", "high.school | \n", "no | \n", "no | \n", "no | \n", "telephone | \n", "may | \n", "fri | \n", "112 | \n", "4 | \n", "999 | \n", "0 | \n", "nonexistent | \n", "1.1 | \n", "93.994 | \n", "-36.4 | \n", "4.859 | \n", "5191.0 | \n", "0 | \n", "
41188 rows × 21 columns
\n", "