{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clustering words or categories"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook is at: https://ageo.co/YywklD"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## K-modes\n",
    "\n",
    "First...\n",
    "\n",
    "    pip install kmodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = [[1,1,2,3],\n",
    "     [1,1,1,1],\n",
    "     [2,2,2,2],\n",
    "     [2,2,3,3],\n",
    "     [1,3,3,3],\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Init: initializing centroids\n",
      "Init: initializing clusters\n",
      "Starting iterations...\n",
      "Run 1, iteration: 1/100, moves: 1, cost: 4.0\n",
      "Run 1, iteration: 2/100, moves: 0, cost: 4.0\n",
      "Init: initializing centroids\n",
      "Init: initializing clusters\n",
      "Starting iterations...\n",
      "Run 2, iteration: 1/100, moves: 0, cost: 4.0\n",
      "Init: initializing centroids\n",
      "Init: initializing clusters\n",
      "Starting iterations...\n",
      "Run 3, iteration: 1/100, moves: 0, cost: 4.0\n",
      "Init: initializing centroids\n",
      "Init: initializing clusters\n",
      "Starting iterations...\n",
      "Run 4, iteration: 1/100, moves: 0, cost: 4.0\n",
      "Init: initializing centroids\n",
      "Init: initializing clusters\n",
      "Starting iterations...\n",
      "Run 5, iteration: 1/100, moves: 0, cost: 4.0\n",
      "Best run was number 1\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([0, 0, 2, 1, 1], dtype=uint16)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from kmodes.kmodes import KModes\n",
    "\n",
    "km = KModes(n_clusters=3, init='Huang', n_init=5, verbose=1)\n",
    "\n",
    "clusters = km.fit(X)\n",
    "\n",
    "km.labels_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "That's... something."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DBSCAN with Levenshtein distance\n",
    "\n",
    "Or any function that provides a 'distance' between two sequences. They can even be of different length (as with Levenshtein)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def levenshtein(seq1, seq2):\n",
    "    oneago = None\n",
    "    thisrow = list(range(1, len(seq2) + 1)) + [0]\n",
    "    for x in range(len(seq1)):\n",
    "        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]\n",
    "        for y in range(len(seq2)):\n",
    "            delcost = oneago[y] + 1\n",
    "            addcost = thisrow[y - 1] + 1\n",
    "            subcost = oneago[y - 1] + (seq1[x] != seq2[y])\n",
    "            thisrow[y] = min(delcost, addcost, subcost)\n",
    "    return thisrow[len(seq2) - 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levenshtein('hi','pie')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = ['hello', 'halo', 'hullo', 'hi', 'pie', 'py', 'my']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 1, 2, 3, 4, 5, 6]), array([0, 0, 0, 1, 1, 1, 1]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.cluster import dbscan\n",
    "\n",
    "def lev_metric(x, y):\n",
    "    i, j = int(x[0]), int(y[0])\n",
    "    return levenshtein(data[i], data[j])\n",
    "\n",
    "X = [[i] for i in range(len(data))]\n",
    "\n",
    "clustering = dbscan(X, metric=lev_metric, eps=2, min_samples=2)\n",
    "\n",
    "clustering"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "geocomp",
   "language": "python",
   "name": "geocomp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}