{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Mobile app churn prediction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's import necessary libs + function for working with Clickhouse data" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": true }, "outputs": [], "source": [ "HOST = 'http://localhost:8123'\n", "import requests\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "try:\n", " from StringIO import StringIO\n", "except ImportError:\n", " from io import StringIO\n", "import seaborn as sns\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import datetime\n", "%matplotlib inline\n", "\n", "# get rid of scientific notation\n", "pd.options.display.float_format = '{:.2f}'.format\n", "\n", "# disable warnings\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "import itertools\n", "from pylab import rcParams\n", "rcParams['figure.figsize'] = 12, 8\n", "\n", "# get query results from DataBase\n", "def get_clickhouse_data(query, host = HOST, connection_timeout = 1500):\n", " r = requests.post(host, params = {'query': query}, timeout = connection_timeout)\n", " if r.status_code == 200:\n", " return r.text\n", " else:\n", " raise ValueError(r.text)\n", " \n", "# convert query results to pandas DataFrame \n", "def get_clickhouse_df(query, host = HOST, connection_timeout = 1500):\n", " data = get_clickhouse_data(query, host, connection_timeout) \n", " df = pd.read_csv(StringIO(data), sep = '\\t')\n", " return df\n", "\n", "query_dates = ['2018-06-01', '2018-07-31'];\n", "\n", "churn_query_dates = ['2018-08-01', '2018-08-31'];\n", "\n", "# date format\n", "fmt = '%Y-%m-%d'\n", "\n", "# a function for calculation some of the features related to users' session activity\n", "def func(a):\n", " i1, i2 = itertools.tee(iter(a))\n", " next(i2)\n", " res = [(int(round((datetime.strptime(y, fmt) - datetime.strptime(x, fmt)).total_seconds() / 60))) / 1440\n", " for x, y in zip(i1, i2)] # or just zip in Python 3\n", " return round(np.mean(res),2)\n", "\n", "from datetime import datetime\n", "\n", "reportDate = datetime(2018, 8, 1)\n", "\n", "sns.set(rc={'axes.facecolor':'grey', 'figure.facecolor':'grey'})\n", "sns.set(palette='Blues')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SESSION DATA" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's get users' session data. In my case I neeed data for 4 months (april through july 2018)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 11.8 s, sys: 5.01 s, total: 16.8 s\n", "Wall time: 19.2 s\n" ] } ], "source": [ "%%time\n", "\n", "q = '''\n", "SELECT\n", " DeviceID,\n", " AppVersionName,\n", " OSName,\n", " City,\n", " SessionStartDate,\n", " SessionStartDateTime,\n", " SessionStartTimestamp\n", "FROM\n", " mobile.sessions_starts_all\n", "WHERE\n", " SessionStartDate BETWEEN ''' + ''' \\'''' + \\\n", " query_dates[0] + '''\\' AND ''' + '''\\'''' + \\\n", " query_dates[1] + '''\\' ''''FORMAT TabSeparatedWithNames'''\n", "\n", "sessions = get_clickhouse_df(q)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add session data for may and april 2018" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sessions_may = pd.read_csv('../dmitriilin/Downloads/sessions_starts_may.csv', sep=',')\n", "sessions_april = pd.read_csv('../dmitriilin/Downloads/sessions_starts_april.csv', sep=',')\n", "sessions_may['SessionStartDate'] = sessions_may['SessionStartDateTime'].apply(lambda x: x.split(' ')[0])\n", "frames = [sessions, sessions_may, sessions_april]\n", "sessions = pd.concat(frames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is how the dataFrame looks like" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | AppVersionName | \n", "City | \n", "DeviceID | \n", "OSName | \n", "SessionStartDate | \n", "SessionStartDateTime | \n", "SessionStartTimestamp | \n", "Unnamed: 0 | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "3.17.2 | \n", "Kazan | \n", "7543012021622200292 | \n", "ios | \n", "2018-06-01 | \n", "2018-06-01 11:57:29 | \n", "1527854249 | \n", "nan | \n", "
1 | \n", "3.17.1 | \n", "Samara | \n", "16883537386825900026 | \n", "android | \n", "2018-06-01 | \n", "2018-06-01 11:18:10 | \n", "1527851890 | \n", "nan | \n", "
2 | \n", "3.17.1 | \n", "Samara | \n", "16883537386825900026 | \n", "android | \n", "2018-06-01 | \n", "2018-06-01 07:11:19 | \n", "1527837079 | \n", "nan | \n", "
3 | \n", "3.18.0 | \n", "Kazan | \n", "170802125429386870 | \n", "android | \n", "2018-06-01 | \n", "2018-06-01 08:03:44 | \n", "1527840224 | \n", "nan | \n", "
4 | \n", "3.18.0 | \n", "Kazan | \n", "170802125429386870 | \n", "android | \n", "2018-06-01 | \n", "2018-06-01 11:31:19 | \n", "1527852679 | \n", "nan | \n", "
\n", " | DeviceID | \n", "sessions_totalnumber | \n", "sessions_avgdaylag | \n", "sessions_daysSinceLastSession | \n", "sessions_totaldaysactive | \n", "payments_operationAmount_count | \n", "payments_operationAmount_sum | \n", "payments_operationAmount_median | \n", "payments_operationAmount_min | \n", "payments_operationAmount_max | \n", "... | \n", "transfers_operationFee_sum | \n", "transfers_sender_nunique | \n", "transfers_recipient_nunique | \n", "transfers_operationMethod_nunique | \n", "transfers_senderCurrency_nunique | \n", "transfers_recipientCurrency_nunique | \n", "transfers_avgdaylag | \n", "transfers_daysactive | \n", "transfers_daysSinceLastTransfer | \n", "churned | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
107876 | \n", "10313336111367727706 | \n", "1 | \n", "53.00 | \n", "53 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "122.00 | \n", "122.00 | \n", "122.00 | \n", "1 | \n", "
68108 | \n", "6514306888048145041 | \n", "1 | \n", "114.00 | \n", "114 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "122.00 | \n", "122.00 | \n", "122.00 | \n", "1 | \n", "
38900 | \n", "3725470542799209790 | \n", "1 | \n", "59.00 | \n", "59 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "122.00 | \n", "122.00 | \n", "122.00 | \n", "0 | \n", "
107882 | \n", "10313953945953922630 | \n", "1 | \n", "122.00 | \n", "122 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "122.00 | \n", "122.00 | \n", "122.00 | \n", "1 | \n", "
45952 | \n", "4396644337809361834 | \n", "1 | \n", "10.00 | \n", "10 | \n", "1 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "... | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "122.00 | \n", "122.00 | \n", "122.00 | \n", "1 | \n", "
5 rows × 31 columns
\n", "