{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['he has written two novels so far',\n", " 'two novels have been written by him so far',\n", " 'they will plant ten trees tomorrow',\n", " 'ten trees will be planted by them tomorrow',\n", " 'bruce writes a letter every week']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%matplotlib inline\n", "\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "data_train = [\n", " \"He has written two novels so far\",\n", " \"Two novels have been written by him so far\",\n", " \"They will plant ten trees tomorrow\",\n", " \"Ten trees will be planted by them tomorrow\",\n", " \"Bruce writes a letter every week\",\n", "]\n", "#todo: read ./train.db \n", "\n", "data_refine = []\n", "for sentence in data_train:\n", " data_refine.append(sentence.lower())\n", " #todo: leave out special character\n", "\n", "data_refine" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[frozenset({' has ',\n", " ' nove',\n", " ' so f',\n", " ' two ',\n", " ' writ',\n", " 'as wr',\n", " 'e has',\n", " 'els s',\n", " 'en tw',\n", " 'has w',\n", " 'he ha',\n", " 'itten',\n", " 'ls so',\n", " 'n two',\n", " 'novel',\n", " 'o far',\n", " 'o nov',\n", " 'ovels',\n", " 'ritte',\n", " 's so ',\n", " 's wri',\n", " 'so fa',\n", " 'ten t',\n", " 'tten ',\n", " 'two n',\n", " 'vels ',\n", " 'wo no',\n", " 'writt'}),\n", " frozenset({' been',\n", " ' by h',\n", " ' have',\n", " ' him ',\n", " ' nove',\n", " ' so f',\n", " ' writ',\n", " 'ave b',\n", " 'been ',\n", " 'by hi',\n", " 'e bee',\n", " 'een w',\n", " 'els h',\n", " 'en by',\n", " 'en wr',\n", " 'have ',\n", " 'him s',\n", " 'im so',\n", " 'itten',\n", " 'ls ha',\n", " 'm so ',\n", " 'n by ',\n", " 'n wri',\n", " 'novel',\n", " 'o far',\n", " 'o nov',\n", " 'ovels',\n", " 'ritte',\n", " 's hav',\n", " 'so fa',\n", " 'ten b',\n", " 'tten ',\n", " 'two n',\n", " 've be',\n", " 'vels ',\n", " 'wo no',\n", " 'writt',\n", " 'y him'}),\n", " frozenset({' plan',\n", " ' ten ',\n", " ' tomo',\n", " ' tree',\n", " ' will',\n", " 'ant t',\n", " 'ees t',\n", " 'en tr',\n", " 'es to',\n", " 'ey wi',\n", " 'hey w',\n", " 'ill p',\n", " 'l pla',\n", " 'lant ',\n", " 'll pl',\n", " 'morro',\n", " 'n tre',\n", " 'nt te',\n", " 'omorr',\n", " 'orrow',\n", " 'plant',\n", " 'rees ',\n", " 's tom',\n", " 't ten',\n", " 'ten t',\n", " 'they ',\n", " 'tomor',\n", " 'trees',\n", " 'will ',\n", " 'y wil'}),\n", " frozenset({' be p',\n", " ' by t',\n", " ' plan',\n", " ' them',\n", " ' tomo',\n", " ' tree',\n", " ' will',\n", " 'anted',\n", " 'be pl',\n", " 'by th',\n", " 'd by ',\n", " 'e pla',\n", " 'ed by',\n", " 'ees w',\n", " 'em to',\n", " 'en tr',\n", " 'es wi',\n", " 'hem t',\n", " 'ill b',\n", " 'l be ',\n", " 'lante',\n", " 'll be',\n", " 'm tom',\n", " 'morro',\n", " 'n tre',\n", " 'nted ',\n", " 'omorr',\n", " 'orrow',\n", " 'plant',\n", " 'rees ',\n", " 's wil',\n", " 'ted b',\n", " 'ten t',\n", " 'them ',\n", " 'tomor',\n", " 'trees',\n", " 'will ',\n", " 'y the'}),\n", " frozenset({' a le',\n", " ' ever',\n", " ' lett',\n", " ' week',\n", " ' writ',\n", " 'a let',\n", " 'bruce',\n", " 'ce wr',\n", " 'e wri',\n", " 'er ev',\n", " 'ery w',\n", " 'es a ',\n", " 'etter',\n", " 'every',\n", " 'ites ',\n", " 'lette',\n", " 'r eve',\n", " 'rites',\n", " 'ruce ',\n", " 'ry we',\n", " 's a l',\n", " 'ter e',\n", " 'tes a',\n", " 'tter ',\n", " 'uce w',\n", " 'very ',\n", " 'write',\n", " 'y wee'})]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "shingle_k = 5\n", "\n", "def shingle_extract(sentence, shingle_k):\n", " \"\"\"extract k elements from sentence step by step.\n", " \n", " \"\"\"\n", " shingle_elements = []\n", " while (len(sentence) >= shingle_k):\n", " element = sentence[0:shingle_k]\n", " if element not in shingle_elements:\n", " shingle_elements.append(element)\n", " sentence = sentence[1:]\n", " return frozenset(shingle_elements)\n", "\n", "shingle_dict = []\n", "for sentence in data_refine:\n", " shingle_dict.append(shingle_extract(sentence, shingle_k))\n", "\n", "shingle_dict" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0., 0., 0., 0., 0.],\n", " [ 0., 0., 0., 0., 0.],\n", " [ 0., 0., 0., 0., 0.],\n", " [ 0., 0., 0., 0., 0.],\n", " [ 0., 0., 0., 0., 0.]])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def jaccard_similarity_calculate(set_a, set_b):\n", " \"\"\"calculate the Jaccard Similarity of two sets.\n", "\n", " Jaccard Similarity: (a n b) / (a u b)\n", " \"\"\"\n", " return np.array(len(set_a.intersection(set_b)), dtype=np.double) / np.array(len(set_a.union(set_b)), dtype=np.double)\n", "\n", "def combination_calculate(n,k): \n", " \"\"\"calculate the number of all possible k combination of n elements.\n", "\n", " combination = n! / (k!)(n-k)!\n", " \"\"\"\n", " import operator \n", " return reduce(operator.mul, range(n - k + 1, n + 1)) / reduce(operator.mul, range(1, k +1)) \n", "\n", "sentence_num = len(shingle_dict)\n", "jaccard_similarity_matrix = np.zeros((sentence_num, sentence_num))\n", "jaccard_similarity_matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }