{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['he has written two novels so far',\n",
       " 'two novels have been written by him so far',\n",
       " 'they will plant ten trees tomorrow',\n",
       " 'ten trees will be planted by them tomorrow',\n",
       " 'bruce writes a letter every week']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "data_train = [\n",
    "    \"He has written two novels so far\",\n",
    "    \"Two novels have been written by him so far\",\n",
    "    \"They will plant ten trees tomorrow\",\n",
    "    \"Ten trees will be planted by them tomorrow\",\n",
    "    \"Bruce writes a letter every week\",\n",
    "]\n",
    "#todo: read ./train.db \n",
    "\n",
    "data_refine = []\n",
    "for sentence in data_train:\n",
    "    data_refine.append(sentence.lower())\n",
    "    #todo: leave out special character\n",
    "\n",
    "data_refine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[frozenset({' has ',\n",
       "            ' nove',\n",
       "            ' so f',\n",
       "            ' two ',\n",
       "            ' writ',\n",
       "            'as wr',\n",
       "            'e has',\n",
       "            'els s',\n",
       "            'en tw',\n",
       "            'has w',\n",
       "            'he ha',\n",
       "            'itten',\n",
       "            'ls so',\n",
       "            'n two',\n",
       "            'novel',\n",
       "            'o far',\n",
       "            'o nov',\n",
       "            'ovels',\n",
       "            'ritte',\n",
       "            's so ',\n",
       "            's wri',\n",
       "            'so fa',\n",
       "            'ten t',\n",
       "            'tten ',\n",
       "            'two n',\n",
       "            'vels ',\n",
       "            'wo no',\n",
       "            'writt'}),\n",
       " frozenset({' been',\n",
       "            ' by h',\n",
       "            ' have',\n",
       "            ' him ',\n",
       "            ' nove',\n",
       "            ' so f',\n",
       "            ' writ',\n",
       "            'ave b',\n",
       "            'been ',\n",
       "            'by hi',\n",
       "            'e bee',\n",
       "            'een w',\n",
       "            'els h',\n",
       "            'en by',\n",
       "            'en wr',\n",
       "            'have ',\n",
       "            'him s',\n",
       "            'im so',\n",
       "            'itten',\n",
       "            'ls ha',\n",
       "            'm so ',\n",
       "            'n by ',\n",
       "            'n wri',\n",
       "            'novel',\n",
       "            'o far',\n",
       "            'o nov',\n",
       "            'ovels',\n",
       "            'ritte',\n",
       "            's hav',\n",
       "            'so fa',\n",
       "            'ten b',\n",
       "            'tten ',\n",
       "            'two n',\n",
       "            've be',\n",
       "            'vels ',\n",
       "            'wo no',\n",
       "            'writt',\n",
       "            'y him'}),\n",
       " frozenset({' plan',\n",
       "            ' ten ',\n",
       "            ' tomo',\n",
       "            ' tree',\n",
       "            ' will',\n",
       "            'ant t',\n",
       "            'ees t',\n",
       "            'en tr',\n",
       "            'es to',\n",
       "            'ey wi',\n",
       "            'hey w',\n",
       "            'ill p',\n",
       "            'l pla',\n",
       "            'lant ',\n",
       "            'll pl',\n",
       "            'morro',\n",
       "            'n tre',\n",
       "            'nt te',\n",
       "            'omorr',\n",
       "            'orrow',\n",
       "            'plant',\n",
       "            'rees ',\n",
       "            's tom',\n",
       "            't ten',\n",
       "            'ten t',\n",
       "            'they ',\n",
       "            'tomor',\n",
       "            'trees',\n",
       "            'will ',\n",
       "            'y wil'}),\n",
       " frozenset({' be p',\n",
       "            ' by t',\n",
       "            ' plan',\n",
       "            ' them',\n",
       "            ' tomo',\n",
       "            ' tree',\n",
       "            ' will',\n",
       "            'anted',\n",
       "            'be pl',\n",
       "            'by th',\n",
       "            'd by ',\n",
       "            'e pla',\n",
       "            'ed by',\n",
       "            'ees w',\n",
       "            'em to',\n",
       "            'en tr',\n",
       "            'es wi',\n",
       "            'hem t',\n",
       "            'ill b',\n",
       "            'l be ',\n",
       "            'lante',\n",
       "            'll be',\n",
       "            'm tom',\n",
       "            'morro',\n",
       "            'n tre',\n",
       "            'nted ',\n",
       "            'omorr',\n",
       "            'orrow',\n",
       "            'plant',\n",
       "            'rees ',\n",
       "            's wil',\n",
       "            'ted b',\n",
       "            'ten t',\n",
       "            'them ',\n",
       "            'tomor',\n",
       "            'trees',\n",
       "            'will ',\n",
       "            'y the'}),\n",
       " frozenset({' a le',\n",
       "            ' ever',\n",
       "            ' lett',\n",
       "            ' week',\n",
       "            ' writ',\n",
       "            'a let',\n",
       "            'bruce',\n",
       "            'ce wr',\n",
       "            'e wri',\n",
       "            'er ev',\n",
       "            'ery w',\n",
       "            'es a ',\n",
       "            'etter',\n",
       "            'every',\n",
       "            'ites ',\n",
       "            'lette',\n",
       "            'r eve',\n",
       "            'rites',\n",
       "            'ruce ',\n",
       "            'ry we',\n",
       "            's a l',\n",
       "            'ter e',\n",
       "            'tes a',\n",
       "            'tter ',\n",
       "            'uce w',\n",
       "            'very ',\n",
       "            'write',\n",
       "            'y wee'})]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shingle_k = 5\n",
    "\n",
    "def shingle_extract(sentence, shingle_k):\n",
    "    \"\"\"extract k elements from sentence step by step.\n",
    "    \n",
    "    \"\"\"\n",
    "    shingle_elements = []\n",
    "    while (len(sentence) >= shingle_k):\n",
    "        element = sentence[0:shingle_k]\n",
    "        if element not in shingle_elements:\n",
    "            shingle_elements.append(element)\n",
    "        sentence = sentence[1:]\n",
    "    return frozenset(shingle_elements)\n",
    "\n",
    "shingle_dict = []\n",
    "for sentence in data_refine:\n",
    "    shingle_dict.append(shingle_extract(sentence, shingle_k))\n",
    "\n",
    "shingle_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def jaccard_similarity_calculate(set_a, set_b):\n",
    "    \"\"\"calculate the Jaccard Similarity of two sets.\n",
    "\n",
    "    Jaccard Similarity: (a n b) / (a u b)\n",
    "    \"\"\"\n",
    "    return np.array(len(set_a.intersection(set_b)), dtype=np.double) / np.array(len(set_a.union(set_b)), dtype=np.double)\n",
    "\n",
    "def combination_calculate(n,k): \n",
    "    \"\"\"calculate the number of all possible k combination of n elements.\n",
    "\n",
    "    combination = n! / (k!)(n-k)!\n",
    "    \"\"\"\n",
    "    import operator  \n",
    "    return  reduce(operator.mul, range(n - k + 1, n + 1)) / reduce(operator.mul, range(1, k +1)) \n",
    "\n",
    "sentence_num = len(shingle_dict)\n",
    "jaccard_similarity_matrix = np.zeros((sentence_num, sentence_num))\n",
    "jaccard_similarity_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}