{ "cells": [ { "cell_type": "markdown", "id": "4458df13-d0f7-462e-bc80-42169bb1a62b", "metadata": {}, "source": [ "This is a starter notebook for an updated module 5 of ML Zoomcamp.\n", "\n", "The code is based on modules 3 and 4. We use the same dataset: [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)" ] }, { "cell_type": "code", "execution_count": 1, "id": "a16177e8-cbd2-4088-9bb0-07a0cfb3eee6", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sklearn" ] }, { "cell_type": "code", "execution_count": 2, "id": "498798c7-1848-47f0-9789-5881ae3658bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pandas==2.3.1\n", "numpy==2.3.1\n", "sklearn==1.7.0\n" ] } ], "source": [ "print(f'pandas=={pd.__version__}')\n", "print(f'numpy=={np.__version__}')\n", "print(f'sklearn=={sklearn.__version__}')" ] }, { "cell_type": "code", "execution_count": 3, "id": "8afe407b-b259-47ca-a4df-0f098df8435b", "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction import DictVectorizer\n", "from sklearn.linear_model import LogisticRegression" ] }, { "cell_type": "code", "execution_count": 4, "id": "e9e9464c-d8ed-45ea-9e8c-70e6d73842f7", "metadata": {}, "outputs": [], "source": [ "data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'\n", "\n", "df = pd.read_csv(data_url)\n", "\n", "df.columns = df.columns.str.lower().str.replace(' ', '_')\n", "\n", "categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)\n", "\n", "for c in categorical_columns:\n", " df[c] = df[c].str.lower().str.replace(' ', '_')\n", "\n", "df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')\n", "df.totalcharges = df.totalcharges.fillna(0)\n", "\n", "df.churn = (df.churn == 'yes').astype(int)" ] }, { "cell_type": "code", "execution_count": 5, "id": 
"8ca89823-6714-42b7-a9a7-d0db94898aee", "metadata": {}, "outputs": [], "source": [ "y_train = df.churn" ] }, { "cell_type": "code", "execution_count": 6, "id": "4d704727-25c1-4000-bbb4-6390bfad72f8", "metadata": {}, "outputs": [], "source": [ "numerical = ['tenure', 'monthlycharges', 'totalcharges']\n", "\n", "categorical = [\n", " 'gender',\n", " 'seniorcitizen',\n", " 'partner',\n", " 'dependents',\n", " 'phoneservice',\n", " 'multiplelines',\n", " 'internetservice',\n", " 'onlinesecurity',\n", " 'onlinebackup',\n", " 'deviceprotection',\n", " 'techsupport',\n", " 'streamingtv',\n", " 'streamingmovies',\n", " 'contract',\n", " 'paperlessbilling',\n", " 'paymentmethod',\n", "]" ] }, { "cell_type": "code", "execution_count": 7, "id": "b2cba1c9-b346-428e-976e-649f1f9bcc72", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression(solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| \n", " | penalty | \n", "'l2' | \n", "
| \n", " | dual | \n", "False | \n", "
| \n", " | tol | \n", "0.0001 | \n", "
| \n", " | C | \n", "1.0 | \n", "
| \n", " | fit_intercept | \n", "True | \n", "
| \n", " | intercept_scaling | \n", "1 | \n", "
| \n", " | class_weight | \n", "None | \n", "
| \n", " | random_state | \n", "None | \n", "
| \n", " | solver | \n", "'liblinear' | \n", "
| \n", " | max_iter | \n", "100 | \n", "
| \n", " | multi_class | \n", "'deprecated' | \n", "
| \n", " | verbose | \n", "0 | \n", "
| \n", " | warm_start | \n", "False | \n", "
| \n", " | n_jobs | \n", "None | \n", "
| \n", " | l1_ratio | \n", "None | \n", "