{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Unsupervised modeling\n", "\n", "The goal of this notebook is to identify and extract clusters that can reveal the user's habits and common routines, such as home/school/work clusters" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[PosixPath('data/ny.html'),\n", " PosixPath('data/houston_processed.feather'),\n", " PosixPath('data/florida.html'),\n", " PosixPath('data/tx.html'),\n", " PosixPath('data/houston_ready.feather'),\n", " PosixPath('data/houston.html'),\n", " PosixPath('data/location_history.json'),\n", " PosixPath('data/houston_processed_miles_time_diff.feather')]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "import json\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "# from geopy.distance import distance\n", "\n", "PATH = Path('data')\n", "list(PATH.iterdir())\n", "\n", "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# from mpl_toolkits.basemap import Basemap\n", "import folium\n", "from folium.plugins import MarkerCluster,FastMarkerCluster\n", "\n", "from sklearn.cluster import KMeans\n", "k=5\n", "\n", "import matplotlib.cm as cmx\n", "import matplotlib.colors as mcolors\n", "# from cycler import cycler\n", "\n", "# def get_cmap(N):\n", "# color_norm = mcolors.Normalize(vmin=0, vmax=N-1)\n", "# return cmx.ScalarMappable(norm=color_norm, cmap='tab10').to_rgba\n", "# num_colr = k\n", "# cmap = get_cmap(num_colr)\n", "# colr_list = [cmap(float(x)) for x in 
range(num_colr)]\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = pd.read_feather(PATH/'houston_ready.feather')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
accuracy30211259
altitude000
headingNaNNaNNaN
velocityNaNNaNNaN
verticalAccuracyNaNNaNNaN
act_conf146NaN100
act_type1UNKNOWNNoneTILTING
act_cont227NaNNaN
act_type2IN_VEHICLENoneNone
extra_intValNaNNaNNaN
extra_nameNoneNoneNone
extra_typeNoneNoneNone
date_time2015-11-30 21:41:32.110000-06:002015-11-30 21:43:05.510000-06:002015-11-30 21:47:40.071000-06:00
year201520152015
month111111
day303030
day_of_week000
hour212121
minute414347
latitude29.689429.689329.6924
longitude-95.2712-95.2712-95.2805
cluster000
mile_diff00.004938730.599316
planeFalseFalseFalse
\n", "
" ], "text/plain": [ " 0 \\\n", "accuracy 30 \n", "altitude 0 \n", "heading NaN \n", "velocity NaN \n", "verticalAccuracy NaN \n", "act_conf1 46 \n", "act_type1 UNKNOWN \n", "act_cont2 27 \n", "act_type2 IN_VEHICLE \n", "extra_intVal NaN \n", "extra_name None \n", "extra_type None \n", "date_time 2015-11-30 21:41:32.110000-06:00 \n", "year 2015 \n", "month 11 \n", "day 30 \n", "day_of_week 0 \n", "hour 21 \n", "minute 41 \n", "latitude 29.6894 \n", "longitude -95.2712 \n", "cluster 0 \n", "mile_diff 0 \n", "plane False \n", "\n", " 1 \\\n", "accuracy 21 \n", "altitude 0 \n", "heading NaN \n", "velocity NaN \n", "verticalAccuracy NaN \n", "act_conf1 NaN \n", "act_type1 None \n", "act_cont2 NaN \n", "act_type2 None \n", "extra_intVal NaN \n", "extra_name None \n", "extra_type None \n", "date_time 2015-11-30 21:43:05.510000-06:00 \n", "year 2015 \n", "month 11 \n", "day 30 \n", "day_of_week 0 \n", "hour 21 \n", "minute 43 \n", "latitude 29.6893 \n", "longitude -95.2712 \n", "cluster 0 \n", "mile_diff 0.00493873 \n", "plane False \n", "\n", " 2 \n", "accuracy 1259 \n", "altitude 0 \n", "heading NaN \n", "velocity NaN \n", "verticalAccuracy NaN \n", "act_conf1 100 \n", "act_type1 TILTING \n", "act_cont2 NaN \n", "act_type2 None \n", "extra_intVal NaN \n", "extra_name None \n", "extra_type None \n", "date_time 2015-11-30 21:47:40.071000-06:00 \n", "year 2015 \n", "month 11 \n", "day 30 \n", "day_of_week 0 \n", "hour 21 \n", "minute 47 \n", "latitude 29.6924 \n", "longitude -95.2805 \n", "cluster 0 \n", "mile_diff 0.599316 \n", "plane False " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "(341045, 24)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3).T\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clustering and plotting cluster" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { 
"image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig,ax = plt.subplots(figsize=(20,10))\n", "_=ax.scatter(df.longitude,df.latitude,c='blue',s=3,alpha=0.5)\n", "\n", "_=ax.set_ylabel('latitude')\n", "_=ax.set_xlabel('longitude')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's exclude flight GPS points" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# recalculate miles differences\n", "def haversine_array(lat1, lng1, lat2, lng2):\n", " lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))\n", " AVG_EARTH_RADIUS = 6371 # in km\n", " lat = lat2 - lat1\n", " lng = lng2 - lng1\n", " d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2\n", " h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))\n", " return h\n", "\n", "lat2 = df.latitude.values.tolist()\n", "long2 = df.longitude.values.tolist()\n", "\n", "lat1 = df.latitude.shift().values.tolist()\n", "lat1[0] = lat2[0]\n", "long1 = df.longitude.shift().values.tolist()\n", "long1[0] = long2[0]\n", "\n", "km_diff = haversine_array(lat1,long1,lat2,long2) \n", "df['mile_diff'] = km_diff * 0.621371 # to miles" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "year month day\n", "2016 4 18 19.314191\n", " 5 31 162.667397\n", " 6 3 708.936381\n", " 10 13 236.593397\n", " 12 21 26.120457\n", " 22 938.012779\n", " 24 139.839309\n", " 26 25.258437\n", " 29 95.911094\n", "2017 1 3 139.469908\n", "Name: mile_diff, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.mile_diff>12].groupby(['year','month','day']).mile_diff.mean()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df['plane'] = df.mile_diff>12" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# 
df.to_feather(PATH/'houston_ready.feather')" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
"# keep only the rows not flagged as flight, re-indexed from 0\n",
"on_ground = ~df.plane\n",
"df_ground = df[on_ground].copy().reset_index(drop=True)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Apply kmeans clustering" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "k=5\n", "kmeans = KMeans(n_clusters=k,random_state=42).fit(df_ground[['latitude','longitude']])\n", "df_ground['cluster']= kmeans.predict(df_ground[['latitude','longitude']])\n", "\n", "fig,ax = plt.subplots(figsize=(20,10))\n", "_=ax.scatter(df_ground.longitude,df_ground.latitude,c=df_ground.cluster,cmap='tab10',s=3,alpha=1)\n", "\n", "_=ax.set_ylabel('latitude')\n", "_=ax.set_xlabel('longitude')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Look like we still have some residuals of flight GPS left. We did get rid most of flight GPS to Florida (bottom right) and around Dallas (above dense Houston points).\n", "\n", "K-means did a good job on separate travelling points (outside of Houston). Let's plot each of them" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig,axes = plt.subplots(nrows=k,figsize=(15,8*k));\n", "for i in range(k):\n", " _=axes[i].scatter(df_ground[df_ground.cluster==i].longitude,df_ground[df_ground.cluster==i].latitude, c=colr_list[i],s=3,alpha=1)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After few k, k=5 seems to fit best as kmeans recognizes all the major states and cities I visited. Let's take a look at few of them" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Mapping GPS point with Folium" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cluster 1: Florida" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8830, 24)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_flo = df_ground[df_ground.cluster==1].reset_index(drop=True)\n", "df_flo.shape" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "\n", "callback = \"\"\"\\\n", "function (row) {\n", " var icon, marker;\n", " icon = L.AwesomeMarkers.icon({\n", " icon: \"map-marker\", markerColor: \"red\"});\n", " marker = L.marker(new L.LatLng(row[0], row[1]));\n", " marker.setIcon(icon);\n", " return marker;\n", "};\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "FLORIDA=[26.492328,-80.497977]\n", "\n", " \n", "m_flo = folium.Map(location=FLORIDA,tiles='cartodbpositron',zoom_start=7)\n", "\n", "FastMarkerCluster(data=list(zip(df_flo.latitude.values,df_flo.longitude.values)),\n", " callback = callback).add_to(m_flo)\n", "\n", "# for lat,long in zip(df_flo.latitude.values,df_flo.longitude.values): \n", "# 
_=folium.CircleMarker([lat,long], radius=1,\n", "# color='#0080bb', fill_color='#0080bb').add_to(m_flo);\n", "\n", "folium.LayerControl().add_to(m_flo)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# m_flo.save(str(PATH/'florida.html'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m_flo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## cluster 2: NY" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3165, 24)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ny = df_ground[df_ground.cluster==2].reset_index(drop=True)\n", "df_ny.shape" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NY=[40.735954,-73.993896]\n", "\n", " \n", "m_ny = folium.Map(location=NY,tiles='cartodbpositron',zoom_start=10)\n", "\n", "FastMarkerCluster(data=list(zip(df_ny.latitude.values,df_ny.longitude.values)),\n", " callback = callback).add_to(m_ny)\n", "\n", "# for lat,long in zip(df_ny.latitude.values,df_ny.longitude.values): \n", "# _=folium.CircleMarker([lat,long], radius=1,\n", "# color='#0080bb', fill_color='#0080bb').add_to(m_ny);\n", "\n", "\n", "folium.LayerControl().add_to(m_ny)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# m_ny.save(str(PATH/'ny.html'))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m_ny" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cluster 3: Austin + San Antonio + Dallas" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7395, 24)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_tx = df_ground[df_ground.cluster==3].reset_index(drop=True)\n", "df_tx.shape" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "TX=[30.265253, -97.714187]\n", "\n", " \n", "m_tx = folium.Map(location=TX,tiles='cartodbpositron',zoom_start=7)\n", "\n", "FastMarkerCluster(data=list(zip(df_tx.latitude.values,df_tx.longitude.values)),\n", " callback = callback).add_to(m_tx)\n", "\n", "# for lat,long in zip(df_tx.latitude.values,df_tx.longitude.values): \n", "# _=folium.CircleMarker([lat,long], radius=1,\n", "# color='#0080bb', fill_color='#0080bb').add_to(m_tx);\n", "\n", "\n", "folium.LayerControl().add_to(m_tx)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# m_tx.save(str(PATH/'tx.html'))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m_tx" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cluster 4: Maryland" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(549, 24)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_md = df_ground[df_ground.cluster==4].reset_index(drop=True)\n", "df_md.shape" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "MD=[39.084967, -77.152883]\n", "\n", " \n", "m_md = folium.Map(location=MD,tiles='cartodbpositron',zoom_start=10)\n", "\n", "FastMarkerCluster(data=list(zip(df_md.latitude.values,df_md.longitude.values)),\n", " callback = callback).add_to(m_md)\n", "\n", "# for lat,long in zip(df_tx.latitude.values,df_tx.longitude.values): \n", "# _=folium.CircleMarker([lat,long], radius=1,\n", "# color='#0080bb', fill_color='#0080bb').add_to(m_tx);\n", "\n", "\n", "folium.LayerControl().add_to(m_md)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "m_md" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Main target: hometown Houston" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(321048, 24)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_h = df_ground[df_ground.cluster==0].reset_index(drop=True)\n", "df_h.shape" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# plot fewer data points for folium\n", "n=50000\n", "idxs = np.random.permutation(len(df_h))[:n]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "H=[29.766672, -95.339652]\n", "\n", " \n", "m_h = folium.Map(location=H,tiles='cartodbpositron',zoom_start=10)\n", "\n", "FastMarkerCluster(data=list(zip(df_h.loc[idxs,:].latitude.values,df_h.loc[idxs,:].longitude.values)),\n", " callback = callback).add_to(m_h)\n", "\n", "# for lat,long in zip(df_h.latitude.values,df_h.longitude.values): \n", "# _=folium.CircleMarker([lat,long], radius=1,\n", "# color='#0080bb', fill_color='#0080bb').add_to(m_h);\n", "\n", "\n", "folium.LayerControl().add_to(m_h)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# m_h.save(str(PATH/'houston.html'))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "