{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4458df13-d0f7-462e-bc80-42169bb1a62b",
   "metadata": {},
   "source": [
    "# ML Zoomcamp module 5: starter notebook\n",
    "\n",
    "This is a starter notebook for the updated module 5 of ML Zoomcamp.\n",
    "\n",
    "The code is based on modules 3 and 4. We use the same dataset: [telco customer churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn).\n",
    "\n",
    "The notebook loads the data, cleans it, and trains a logistic regression model on the full dataset (there is no train/validation split here)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Imports and library versions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a16177e8-cbd2-4088-9bb0-07a0cfb3eee6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "498798c7-1848-47f0-9789-5881ae3658bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pandas==2.3.1\n",
      "numpy==2.3.1\n",
      "sklearn==1.7.0\n"
     ]
    }
   ],
   "source": [
    "print(f'pandas=={pd.__version__}')\n",
    "print(f'numpy=={np.__version__}')\n",
    "print(f'sklearn=={sklearn.__version__}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load and clean the data\n",
    "\n",
    "Column names and string values are normalized to lowercase with underscores instead of spaces. `totalcharges` is converted to a number, and `churn` becomes a 0/1 target."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e9e9464c-d8ed-45ea-9e8c-70e6d73842f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_url = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'\n",
    "\n",
    "df = pd.read_csv(data_url)\n",
    "\n",
    "df.columns = df.columns.str.lower().str.replace(' ', '_')\n",
    "\n",
    "# every object-dtype (string) column, not only the model's categorical features\n",
    "categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)\n",
    "\n",
    "for c in categorical_columns:\n",
    "    df[c] = df[c].str.lower().str.replace(' ', '_')\n",
    "\n",
    "# values that cannot be parsed as numbers become NaN and are then replaced with 0\n",
    "df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)\n",
    "\n",
    "df['churn'] = (df['churn'] == 'yes').astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-features",
   "metadata": {},
   "source": [
    "## Target and features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8ca89823-6714-42b7-a9a7-d0db94898aee",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = df['churn']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4d704727-25c1-4000-bbb4-6390bfad72f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "numerical = ['tenure', 'monthlycharges', 'totalcharges']\n",
    "\n",
    "categorical = [\n",
    "    'gender',\n",
    "    'seniorcitizen',\n",
    "    'partner',\n",
    "    'dependents',\n",
    "    'phoneservice',\n",
    "    'multiplelines',\n",
    "    'internetservice',\n",
    "    'onlinesecurity',\n",
    "    'onlinebackup',\n",
    "    'deviceprotection',\n",
    "    'techsupport',\n",
    "    'streamingtv',\n",
    "    'streamingmovies',\n",
    "    'contract',\n",
    "    'paperlessbilling',\n",
    "    'paymentmethod',\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-train",
   "metadata": {},
   "source": [
    "## Train the model\n",
    "\n",
    "Each row is turned into a dictionary and encoded with `DictVectorizer`; a logistic regression model is then fitted on the result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b2cba1c9-b346-428e-976e-649f1f9bcc72",
   "metadata": {},
   "outputs": [],
   "source": [
    "dv = DictVectorizer()\n",
    "\n",
    "train_dict = df[categorical + numerical].to_dict(orient='records')\n",
    "X_train = dv.fit_transform(train_dict)\n",
    "\n",
    "# liblinear uses random_state when shuffling the data; fixing it keeps the fit reproducible\n",
    "model = LogisticRegression(solver='liblinear', random_state=1)\n",
    "\n",
    "# the trailing semicolon suppresses the estimator repr, so no HTML output is stored\n",
    "model.fit(X_train, y_train);"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}