{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# from geopy.distance import distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[PosixPath('data/ny.html'),\n",
       " PosixPath('data/houston_processed.feather'),\n",
       " PosixPath('data/florida.html'),\n",
       " PosixPath('data/tx.html'),\n",
       " PosixPath('data/houston_ready.feather'),\n",
       " PosixPath('data/houston.html'),\n",
       " PosixPath('data/location_history.json'),\n",
       " PosixPath('data/houston_processed_miles_time_diff.feather')]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "PATH = Path('data')\n",
    "list(PATH.iterdir())\n",
    "\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "\n",
    "# from mpl_toolkits.basemap import Basemap\n",
    "import folium\n",
    "from folium.plugins import MarkerCluster,FastMarkerCluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_feather(PATH/'houston_ready.feather')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Density-based Spatial Clustering of Applications with Noise (DBSCAN)\n",
    "\n",
    "This algorithm is similar to mean-shift (density-based). One different thing about DBSCAN is how it processes outliers: Meanshift includes outliers into a cluster, meanwhile DBSCAN identifies them as noises\n",
    "\n",
    "Pros: \n",
    "- no need to specify number of clusters\n",
    "- good for data with similar density\n",
    "- can handle arbitrary distance functions, such as haversine/geodetic distance functions -> more suitable for this GPS dataset \n",
    "\n",
    "Cons:\n",
    "- does not work well with varying density clusters (harder to estimate distance threshold and min points)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_small = df.iloc[-100000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "coords = df[['latitude','longitude']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import DBSCAN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "kms_per_radian = 6371.0088\n",
    "epsilon = 2 / kms_per_radian"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db = DBSCAN(eps=epsilon, min_samples=100, algorithm='ball_tree', metric='haversine',n_jobs=-1)\n",
    "cluster = db.fit_predict(np.radians(coords))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}