{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Required libraries and setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!pip install python-Levenshtein\n", "!pip install distance" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "__author__ = 'j'\n", "import distance\n", "from sklearn import svm, linear_model\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import roc_curve, auc,f1_score\n", "import Levenshtein as lev\n", "import numpy as np\n", "import pandas as pd\n", "import pylab as plt\n", "import seaborn as sns\n", "%matplotlib inline\n", "# These are the \"Tableau 20\" colors as RGB.\n", "tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),\n", " (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),\n", " (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),\n", " (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),\n", " (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]\n", "\n", "# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.\n", "for i in range(len(tableau20)):\n", " r, g, b = tableau20[i]\n", " tableau20[i] = (r / 255., g / 255., b / 255.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test and train set (required files)\n", "There are two databases of movies: **\"./D/database_1.csv\"** and **\"./D/database_2.csv\"**\n", "\n", "From there I created small train and test sets by hand: **\"./D/train.csv\"**, **\"./D/test.csv\"**" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mkdir: cannot create directory ā€˜Dā€™: File exists\r\n" ] } ], "source": [ "!mkdir D" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# All distance functions required" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def jaccard(set_1, set_2):\n", " \"\"\"\n", " :param set_1: set of characters string 1\n", " :param set_2: set of characters string 2\n", " :return: jaccard distance\n", " \"\"\"\n", " n = len(set_1.intersection(set_2))\n", " return n / float(len(set_1) + len(set_2) - n)\n", "\n", "def dice_coefficient(a,b,lenGram=2):\n", " \"\"\"\n", " :param a: string 1\n", " :param b: string 2\n", " :param lenGram: length of the n-grams\n", " :return: dice score\n", "\n", " From Rossetta code\n", " \"\"\"\n", " if not len(a) or not len(b): return 0.0\n", " \"\"\" quick case for true duplicates \"\"\"\n", " if a == b: return 1.0\n", " \"\"\" if a != b, and a or b are single chars, then they can't possibly match \"\"\"\n", " if len(a) == 1 or len(b) == 1: return 0.0\n", "\n", " \"\"\" use python list comprehension, preferred over list.append() \"\"\"\n", " a_bigram_list = [a[i:i+lenGram] for i in range(len(a)-1)]\n", " b_bigram_list = [b[i:i+lenGram] for i in range(len(b)-1)]\n", "\n", " a_bigram_list.sort()\n", " b_bigram_list.sort()\n", "\n", " # assignments to save function calls\n", " lena = len(a_bigram_list)\n", " lenb = len(b_bigram_list)\n", " # initialize match counters\n", " matches = i = j = 0\n", " while (i < lena and j < lenb):\n", " if a_bigram_list[i] == b_bigram_list[j]:\n", " matches += lenGram\n", " i += 1\n", " j += 1\n", 
" elif a_bigram_list[i] < b_bigram_list[j]:\n", " i += 1\n", " else:\n", " j += 1\n", "\n", " score = float(matches)/float(lena + lenb)\n", " return score\n", "\n", "\n", "def cosineBigrams(a,b,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram=3):\n", " \"\"\"\n", " :param a: string 1\n", " :param b: string 2\n", " :param dictTrainBigrams: Dictionary of bigrams to find index quickly\n", " :param tfidf_matrix_trainBigrams: Weigths of bigrrams\n", " :param lenGram: Length of n-grams (3)\n", " :return: cosine similarity (angle between vectors)\n", " \"\"\"\n", " a = a.lower().rstrip()\n", " b = b.lower().rstrip()\n", " st1 = ' '.join([elem for elem in [a[i:i+lenGram] for i in range(len(a)-1)] if len(elem) == lenGram])\n", " st2 = ' '.join([elem for elem in [b[i:i+lenGram] for i in range(len(b)-1)] if len(elem) == lenGram])\n", " ind_a = dictTrainBigrams[st1]\n", " ind_b = dictTrainBigrams[st2]\n", " score = cosine_similarity(tfidf_matrix_trainBigrams[ind_a:ind_a+1], tfidf_matrix_trainBigrams[ind_b:ind_b+1])\n", " return score\n", "\n", "def cosineWords(a,b,dictTrain,tfidf_matrix_train):\n", " \"\"\"\n", " :param a: string 1\n", " :param b: string 2\n", " :param dictTrain: Dictionary of wors to find index quickly\n", " :param tfidf_matrix_train: Weights of words\n", " :return: cosine similarity (angle between vectors)\n", " \"\"\"\n", " ind_a = dictTrain[a.lower().rstrip()]\n", " ind_b = dictTrain[b.lower().rstrip()]\n", " score = cosine_similarity(tfidf_matrix_train[ind_a:ind_a+1], tfidf_matrix_train[ind_b:ind_b+1])\n", " return score\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create TF-IDF" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def read_words(database1,database2):\n", " ## Bag of words\n", " with open(database1) as f:\n", " train_set1 = [line.lower().rstrip() for line in f]\n", " with open(database2) as f:\n", " train_set2 = [line.lower().rstrip() for line in f]\n", "\n", " train_set = sorted(list(set(train_set1 + train_set2)))\n", " return train_set\n", "\n", "def create_trigrams(train_set,lenGram):\n", " train_setNgrams = []\n", " for mov in train_set:\n", " temp = [mov[i:i+lenGram] for i in range(len(mov)-1)]\n", " temp = [elem for elem in temp if len(elem) == lenGram]\n", " train_setNgrams.append(' '.join(temp))\n", "\n", " train_setNgrams = sorted(list(set(train_setNgrams)))\n", " \n", " return train_setNgrams\n", "\n", "def createTFIDF(database1,database2,lenGram = 3):\n", " ## Words\n", " train_set = read_words(database1,database2)\n", "\n", " # Create dictionary to find movie faster and find weights\n", " dictTrain = dict(zip(train_set,range(len(train_set))))\n", " tfidf_vectorizer = TfidfVectorizer()\n", " tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)\n", "\n", " ## Tri-grams\n", " train_setNgrams = create_trigrams(train_set,lenGram)\n", " \n", " # Create dictionary to find movie faster and find weights\n", " dictTrainNgrams = dict(zip(train_setNgrams,range(len(train_setNgrams))))\n", " tfidf_vectorizerNgrams = TfidfVectorizer()\n", " tfidf_matrix_trainNgrams = tfidf_vectorizerNgrams.fit_transform(train_setNgrams)\n", "\n", " return [tfidf_matrix_train,dictTrain,tfidf_matrix_trainNgrams,dictTrainNgrams,lenGram]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "database1 = \"./D/database_1.csv\"\n", "database2 = \"./D/database_2.csv\"\n", 
"tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram = createTFIDF(database1,database2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Train (find all distances)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def find_distances(st1,st2):\n", " \"\"\"\n", " Find distances between two strings\n", " Types of distances: \n", " \"Lev\",\"Jaro\",\"Jaro-Winkler\",\"Ratio\",\"Sorensen\",\"Jaccard\",\"Lev1\",\"Lev2\",\"Dice_2\",\"Dice_3\",\"Dice_4\",\"cosineWords\",\"cosineBigrams\"\n", " \"\"\"\n", " \n", " return [1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),\n", " lev.jaro(st1,st2),\n", " lev.jaro_winkler(st1,st2),\n", " lev.ratio(st1,st2),\n", " distance.sorensen(st1,st2),\n", " jaccard(set(st1),set(st2)),\n", " 1. - distance.nlevenshtein(st1,st2,method=1),\n", " 1. - distance.nlevenshtein(st1,st2,method=2),\n", " dice_coefficient(st1,st2,lenGram=2),\n", " dice_coefficient(st1,st2,lenGram=3),\n", " dice_coefficient(st1,st2,lenGram=4),\n", " cosineWords(st1,st2,dictTrain,tfidf_matrix_train),\n", " cosineBigrams(st1,st2,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram)]\n", "\n", "def train(train_data_file,tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,sep=\"\\t\"):\n", " \"\"\"\n", " find distances and train svm and logit prediction with the train data\n", " \"\"\"\n", " allTrainX = list()\n", " allTrainY = list()\n", " with open(train_data_file) as f:\n", " for line in f:\n", " lin = line.split(sep)\n", "\n", " st1 = lin[0].lower()\n", " st2 = lin[1].lower()\n", "\n", " temp = find_distances(st1,st2)\n", "\n", " allTrainX.append(temp)\n", " allTrainY.append(int(lin[2]))\n", "\n", "\n", " X = np.array(allTrainX,dtype=float)\n", " y = np.array(allTrainY,dtype=float)\n", " clf = svm.LinearSVC(C=1.,dual=False,loss='l2', penalty='l1')\n", " clf2 = linear_model.LogisticRegression(C=1.,dual=False, penalty='l1')\n", " clf.fit(X, y)\n", " clf2.fit(X, y)\n", " weights = np.array(clf.coef_[0])\n", " print(weights)\n", " weights = np.array(clf2.coef_[0])\n", " print(weights)\n", "\n", "\n", " return clf,clf2" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[-2.55728062 2.27080268 -0.90387197 0. 0. -0.06514155\n", " 0. 0. 3.87674783 0.32821677 0. 0.88335231\n", " 0.23681376]\n", "[-2.22705824 0. 0. 0. 0. 0. 0.\n", " 0. 3.83517386 2.11304333 0. 2.97997449 1.21442765]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/shared/anaconda3/lib/python3.5/site-packages/sklearn/svm/classes.py:219: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. 
 ], "source": [ "train_data_file = \"./D/train.csv\"  # must be a tab-separated file\n", "clf,clf2 = train(train_data_file,tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,sep=\"\\t\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Test and plot results (requires test file)" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def test(test_data_file, tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, clf1, clf2, sep=\"\\t\"):\n", "    \"\"\"\n", "    Find the distances and the SVM and logit predictions for the test data\n", "    \"\"\"\n", "    # first pass: count the lines to preallocate the prediction matrix\n", "    with open(test_data_file) as infile:\n", "        for i, line in enumerate(infile):\n", "            pass\n", "\n", "    # 13 distance features + SVM score + logit score + true label\n", "    dimMatrix = 16\n", "    predict = np.zeros((i+1, dimMatrix))\n", "\n", "    with open(test_data_file) as infile:\n", "        for i, line in enumerate(infile):\n", "            lin = line.rstrip().split(sep)\n", "\n", "            ## create the same vector of distances as in training\n", "            st1 = lin[0].lower()\n", "            st2 = lin[1].lower()\n", "\n", "            temp = find_distances(st1, st2)\n", "            temp = np.array(temp, dtype=float).reshape(1, -1)\n", "\n", "            predict[i, :-3] = temp\n", "            # Predict SVM\n", "            predict[i, -3] = float(clf1.decision_function(temp))\n", "            # Predict Logit\n", "            predict[i, -2] = float(clf2.decision_function(temp))\n", "            # Real label\n", "            predict[i, -1] = lin[-1]\n", "\n", "    return predict\n" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(action='once')" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "test_data_file = \"./D/test.csv\"\n", "predict = test(test_data_file,tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,clf,clf2,sep=\"\\t\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Plot results" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def barplot(x, y, xlabel, ylabel, xticks):\n", "    fig = plt.figure(figsize=(9,6))\n", "    ax = fig.add_subplot(111)\n", "    ax.bar(range(x), y)\n", "\n", "    plt.xticks(np.arange(x)+0.5, xticks, rotation=45)\n", "    ax.set_ylabel(ylabel)\n", "    ax.set_xlabel(xlabel)\n", "    plt.show()\n", "\n", "def plot(predict):\n", "    \"\"\"\n", "    Plot the results based on predict (last column holds the real label; the\n", "    other columns are ordered as in find_distances, followed by SVM and logit)\n", "    \"\"\"\n", "    labelsM = [\"Lev\",\"Jaro\",\"Jaro-Winkler\",\"Ratio\",\"Sorensen\",\"Jaccard\",\"Lev1\",\"Lev2\",\"Dice_2\",\"Dice_3\",\"Dice_4\",\"cosineWords\",\"cosineBigrams\",\"SVM\",\"Logit\"]\n", "    dimMatrix = len(labelsM)\n", "\n", "\n", "    # sweep 100 thresholds and keep, for every score, the best F1 it can reach\n", "    f1matrix = np.zeros((100, dimMatrix))\n", "\n", "    iC = -1\n", "    for i in np.linspace(0, 1, 100):\n", "        iC += 1\n", "        for j in range(dimMatrix):\n", "            t = np.array(predict[:, j])\n", "            if j >= dimMatrix-2:\n", "                # min-max scale the SVM/logit decision scores into [0, 1]\n", "                t = (t - np.min(t)) / (np.max(t) - np.min(t))\n", "            f1matrix[iC, j] = f1_score(y_pred=t > i, y_true=predict[:, -1])\n", "\n", "    F1scores = np.max(f1matrix, axis=0)\n", "    barplot(dimMatrix, F1scores, xlabel=\"Parameter\", ylabel=\"F1 score\", xticks=labelsM)\n", "\n", "    fig = plt.figure(figsize=(9,6))\n", "    ax = fig.add_subplot(111)\n", "    AUCScores = []\n", "    for j in range(dimMatrix):\n", "        # Compute the ROC curve and the area under the curve\n", "        fpr, tpr, thresholds = roc_curve(predict[:, -1], predict[:, j])\n",
 "        AUCScores.append(auc(fpr, tpr))\n", "\n", "        # Plot the ROC curve for this score\n", "        ax.plot(fpr, tpr, label=labelsM[j], color=tableau20[j])\n", "    # diagonal = performance of a random guess\n", "    ax.plot([0, 1], [0, 1], 'k--')\n", "    plt.xlim([0.0, 1.0])\n", "    plt.ylim([0.0, 1.0])\n", "    ax.set_xlabel('False Positive Rate')\n", "    ax.set_ylabel('True Positive Rate')\n", "    ax.set_title('ROC Curve')\n", "\n", "    plt.legend(loc=2)\n", "\n", "    plt.show()\n", "\n", "\n", "    barplot(dimMatrix, AUCScores, xlabel=\"Parameter\", ylabel=\"Area Under Curve\", xticks=labelsM)\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plot(predict)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }