{
  "cells": [
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a",
        "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
        "trusted": false
      },
      "cell_type": "code",
      "source": "import numpy as np\nimport pandas as pd",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "2c308efde1fe4889500ee5c49df7145c29ba7fd4",
        "_cell_guid": "00abb666-e575-4785-a1a3-02bb58590df7",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "!ls ../input",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "d44fe8e6b84bce96399f2435583b6feec0b7cd0d",
        "_cell_guid": "39064b57-38ce-4310-a904-fbbfbde53e47",
        "trusted": false
      },
      "cell_type": "code",
      "source": "import codecs\ninput_file = codecs.open('../input/disasters-on-social-media/socialmedia-disaster-tweets-DFE.csv', \n                         'r',\n                         encoding='utf-8', \n                         errors='replace')\noutput_file = open('clean_socialmedia-disaster.csv', 'w')\n    \nfor line in input_file:\n    out = line\n    output_file.write(line)\n",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "6a630a757e8576000f55dfa9d47bb8588fe8b4f8",
        "_cell_guid": "9c5a237c-53ec-4658-9361-cbd4f8264545",
        "trusted": false
      },
      "cell_type": "code",
      "source": "input_file.close()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "e62d9d3b7ab866b365a5e559c41166256e6b5a2f",
        "_cell_guid": "44956c28-a0f2-44c5-8357-980f8af0c0f5",
        "trusted": false
      },
      "cell_type": "code",
      "source": "output_file.close()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "cc4d7c47dead195f110c14d6d5d69846519f6a48",
        "_cell_guid": "65d3add6-d004-4f0d-8d76-a8911f9c750d",
        "trusted": false
      },
      "cell_type": "code",
      "source": "df = pd.read_csv('clean_socialmedia-disaster.csv')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "2364d37d3b1f189988ce40f0017d44846615f607",
        "_cell_guid": "21f03b7a-959e-4aca-a77e-ef79b0b34b40",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "2e523f8e869affd075fbe49c29979cf0faef01d5",
        "_cell_guid": "d60abee3-1dab-4f88-8000-04654bb49ef0",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.shape",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "9eb71e33dbc95d5645e10f80881637fea0fd9c23",
        "_cell_guid": "12fc0a9f-4fc4-49c4-90ea-5795344e9a34",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.choose_one.unique()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "3a28f1f7bc1997b61b06f3c002736273eb70a5d6",
        "_cell_guid": "898d4317-d0a1-4352-a914-bb0644e5e476",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df['choose_one'].value_counts().plot(kind='bar')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "1692c9ca79db9937ea3e69426af257b4f045e59b",
        "_cell_guid": "8068263e-b24f-429d-9d46-0b75bc092b4d",
        "trusted": false
      },
      "cell_type": "code",
      "source": "df = df[df.choose_one != \"Can't Decide\"]",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "802af3ed901ad8ecd4fd52a27c2723ec697cf883",
        "_cell_guid": "6193dd41-7d41-4942-87cc-129264c260b0",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.shape",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "7859dea634be4d45e8db7d0a553f88033670a2bf",
        "_cell_guid": "27e9dcc2-0a2c-44d8-8c76-02ca6e78cae9",
        "trusted": false
      },
      "cell_type": "code",
      "source": "df = df[['text','choose_one']]",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "3ec9b954eb4a7a96be0cc84afa72bb7ccee79680",
        "_cell_guid": "c2035dbe-a0d1-4de1-b2e3-64b19f373480",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "addb57d580d894cc3f2d82e59734f7df334770e6",
        "_cell_guid": "73a35c46-9148-43ca-baa2-9a1ed99b41e9",
        "trusted": false
      },
      "cell_type": "code",
      "source": "df['relevant'] = df.choose_one.map({'Relevant':1,'Not Relevant':0})",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "4e177724a2f53bd71c5062074eb3fa5482a9b545",
        "_cell_guid": "4808d1d5-3072-4cfd-8568-174794b105ae",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "683ce9a5cdaab3f4028690fbf9419a3a3be94457",
        "_cell_guid": "6e9cb347-8e9f-40e9-aa4b-ddd396f67dc1",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.describe()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "4811145728d79c4cdcf432910cf5188d392a918a",
        "_cell_guid": "de7a666e-abea-4d8c-96cc-f0487363c620"
      },
      "cell_type": "markdown",
      "source": "# Lemmatization"
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "413fbc49a7669a9b2f229b2331badc96242e4db5",
        "_cell_guid": "3a606a1e-cc2b-4ece-8049-fc314b8706cb",
        "trusted": false
      },
      "cell_type": "code",
      "source": "import spacy",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "f186bdd90856bdd20ed500e28a85e9576648155a",
        "_cell_guid": "7f7a2b1b-3e89-4272-834b-fae5eb7da830",
        "trusted": false
      },
      "cell_type": "code",
      "source": "nlp = spacy.load('en',disable=['tagger','parser','ner'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c465fe8007f47352b0ff6d5540b69bdf401e3464",
        "_cell_guid": "cf5b018d-ac0c-4a74-97f1-d9e85ebd5f3a",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from tqdm import tqdm, tqdm_notebook\ntqdm.pandas(tqdm_notebook)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "5f6c14cc863360f6233d89d90528886b97dd790d",
        "_cell_guid": "08527279-9f8b-4739-93f6-84678551db45",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df['lemmas'] = df[\"text\"].progress_apply(lambda row: \n                                         [w.lemma_ for w in nlp(row)])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "65fbb1885819974193c8df569681f8dd3962aef8",
        "_cell_guid": "5429aea8-6066-4a3d-a129-560e71330c49",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "64337904344f9a66eadbd17245fa2c51c168ee5d",
        "_cell_guid": "1053aa84-fddf-47e4-82a6-d18e41aba506",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df['joint_lemmas'] = df['lemmas'].progress_apply(lambda row: ' '.join(row))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "22eb152feddf5fc5c3c4f8f25475278214f03d0c",
        "_cell_guid": "1d39c48f-c4d8-4ed2-a353-ba5c8b67d28a",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "df.head()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "815ea14477cf9059329bd11e16f02f5fc93e6960",
        "_cell_guid": "564d3ec1-1095-4462-a7e7-e976acee1bae"
      },
      "cell_type": "markdown",
      "source": "# Bag of words"
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "b3ec80840bc0edb7e4829008c365e242065729cf",
        "_cell_guid": "2c8f1734-740f-48f6-931e-311d092e8b7a",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(df['joint_lemmas'], \n                                                    df['relevant'], \n                                                    test_size=0.2,\n                                                    random_state=40)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "ca726056dc4ba9758407c3755dc667ed2eee122b",
        "_cell_guid": "c7be9c52-4642-4b17-8b1c-855a4ccc6121",
        "trusted": false
      },
      "cell_type": "code",
      "source": "\nfrom sklearn.feature_extraction.text import CountVectorizer\n\ncount_vectorizer = CountVectorizer(max_features=5000)\nX_train_counts = count_vectorizer.fit_transform(X_train)\nX_test_counts = count_vectorizer.transform(X_test)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "a5603f2985bda0442b5a19eeddf274a8757cf16d",
        "_cell_guid": "0696dc01-a0b5-48f3-a4c7-a48f9f029b5f",
        "trusted": false
      },
      "cell_type": "code",
      "source": "X_train_counts.shape",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "0d241b3b7627f8c4fc61c68930f1f98e38a6af4d",
        "_cell_guid": "369f85ff-b263-4fbf-b76a-f967742c4bcc",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.decomposition import TruncatedSVD\nimport matplotlib.pyplot as plt\nimport matplotlib.patches as mpatches\nimport matplotlib\n\n \n\nlsa = TruncatedSVD(n_components=2)\nlsa.fit(X_train_counts)\nlsa_scores = lsa.transform(X_train_counts)\n\n\nfig = plt.figure(figsize=(16, 16))   \ncolors = ['orange','blue']\n\nplt.scatter(lsa_scores[:,0], \n            lsa_scores[:,1], \n            s=8, alpha=.8, \n            c=y_train,\n            cmap=matplotlib.colors.ListedColormap(colors))\n\nir_patch = mpatches.Patch(color='Orange',label='Irrelevant')\n\ndis_patch = mpatches.Patch(color='Blue',label='Disaster')\n\nplt.legend(handles=[ir_patch, dis_patch], prop={'size': 30})\n\nplt.show()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c81a3571b9b9f99e03578280fde240f777e284e4",
        "_cell_guid": "72371d13-e2ae-4639-a910-adb436b3ff41",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nclf = LogisticRegression()\n\nclf.fit(X_train_counts, y_train)\n\ny_predicted = clf.predict(X_test_counts)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "5ad6fc4281ec6d3b870ff548dab0f5bbbee5681f",
        "_cell_guid": "cf4cd8ef-b053-4454-924c-e8e14309d8a1",
        "trusted": false
      },
      "cell_type": "code",
      "source": "accuracy_score(y_test, y_predicted)",
      "execution_count": null,
      "outputs": []
    },
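    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Accuracy alone can hide how the model treats the two classes. As an optional check (a small sketch using scikit-learn's `classification_report`), we can also look at per-class precision and recall for the bag-of-words model:"
    },
    {
      "metadata": {
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.metrics import classification_report\n# Per-class precision, recall and F1 for the bag-of-words logistic regression\nprint(classification_report(y_test, y_predicted, target_names=['Irrelevant', 'Disaster']))",
      "execution_count": null,
      "outputs": []
    },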
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "88fac49aad6ededc62b7c5ebc5519547b94b3bab",
        "_cell_guid": "0615a89f-b61a-40f3-9fdf-fbac04d70886",
        "trusted": false
      },
      "cell_type": "code",
      "source": "import numpy as np\nimport itertools\nfrom sklearn.metrics import confusion_matrix\n\ndef plot_confusion_matrix(cm, classes,\n                          normalize=False,\n                          title='Confusion matrix',\n                          cmap=plt.cm.winter):\n    if normalize:\n        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n    plt.title(title, fontsize=30)\n    plt.colorbar()\n    tick_marks = np.arange(len(classes))\n    plt.xticks(tick_marks, classes, fontsize=20)\n    plt.yticks(tick_marks, classes, fontsize=20)\n    \n    fmt = '.2f' if normalize else 'd'\n    thresh = cm.max() / 2.\n\n    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment=\"center\", \n                 color=\"white\" if cm[i, j] < thresh else \"black\", fontsize=40)\n    \n    plt.tight_layout()\n    plt.ylabel('True label', fontsize=30)\n    plt.xlabel('Predicted label', fontsize=30)\n\n    return plt",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "01be8be6a0d160ad59da17553d556d3bf7ae3a36",
        "_cell_guid": "980c484f-5994-4fd0-9009-25dd620f002a",
        "trusted": false
      },
      "cell_type": "code",
      "source": "cm = confusion_matrix(y_test, y_predicted)\nfig = plt.figure(figsize=(10, 10))\nplot = plot_confusion_matrix(cm, classes=['Irrelevant','Disaster'], normalize=False, title='Confusion matrix')\nplt.show()\nprint(cm)\n",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "230f3949f4cb5713a44121f87939a684c06d3789",
        "_cell_guid": "a3ecd467-9ae0-4097-bce2-07876c8f05bf",
        "trusted": false
      },
      "cell_type": "code",
      "source": "def get_most_important_features(vectorizer, model, n=5):\n    index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}\n    \n    # loop for each class\n    classes ={}\n    for class_index in range(model.coef_.shape[0]):\n        word_importances = [(el, index_to_word[i]) for i,el in enumerate(model.coef_[class_index])]\n        sorted_coeff = sorted(word_importances, key = lambda x : x[0], reverse=True)\n        tops = sorted(sorted_coeff[:n], key = lambda x : x[0])\n        bottom = sorted_coeff[-n:]\n        classes[class_index] = {\n            'tops':tops,\n            'bottom':bottom\n        }\n    return classes\n\nimportance = get_most_important_features(count_vectorizer, clf, 10)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "bda2d90fd49c25e9ac7885fc7ccbd3b0844492e9",
        "_cell_guid": "45e37db0-3cf1-4dd5-801a-fadc08815bf7",
        "trusted": false
      },
      "cell_type": "code",
      "source": "def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):\n    y_pos = np.arange(len(top_words))\n    top_pairs = [(a,b) for a,b in zip(top_words, top_scores)]\n    top_pairs = sorted(top_pairs, key=lambda x: x[1])\n    \n    bottom_pairs = [(a,b) for a,b in zip(bottom_words, bottom_scores)]\n    bottom_pairs = sorted(bottom_pairs, key=lambda x: x[1], reverse=True)\n    \n    top_words = [a[0] for a in top_pairs]\n    top_scores = [a[1] for a in top_pairs]\n    \n    bottom_words = [a[0] for a in bottom_pairs]\n    bottom_scores = [a[1] for a in bottom_pairs]\n    \n    fig = plt.figure(figsize=(10, 10))  \n\n    plt.subplot(121)\n    plt.barh(y_pos,bottom_scores, align='center', alpha=0.5)\n    plt.title('Irrelevant', fontsize=20)\n    plt.yticks(y_pos, bottom_words, fontsize=14)\n    plt.suptitle('Key words', fontsize=16)\n    plt.xlabel('Importance', fontsize=20)\n    \n    plt.subplot(122)\n    plt.barh(y_pos,top_scores, align='center', alpha=0.5)\n    plt.title('Disaster', fontsize=20)\n    plt.yticks(y_pos, top_words, fontsize=14)\n    plt.suptitle(name, fontsize=16)\n    plt.xlabel('Importance', fontsize=20)\n    \n    plt.subplots_adjust(wspace=0.8)\n    plt.show()\n\ntop_scores = [a[0] for a in importance[0]['tops']]\ntop_words = [a[1] for a in importance[0]['tops']]\nbottom_scores = [a[0] for a in importance[0]['bottom']]\nbottom_words = [a[1] for a in importance[0]['bottom']]\n\nplot_important_words(top_scores, top_words, bottom_scores, bottom_words, \"Most important words for relevance\")",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "357989b0237da7a84778df374ebcd5e876724c87",
        "_cell_guid": "9d1217cb-172f-46b4-92e5-0fa73f8a9208",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf_vectorizer = TfidfVectorizer(max_features=10000)\n\nX_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\nX_test_tfidf = tfidf_vectorizer.transform(X_test)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c7c828806f3408d5de847b264adcb9789730a8b9",
        "_cell_guid": "aa0ac1e0-e6fa-47a9-a439-542119c07a46",
        "trusted": false
      },
      "cell_type": "code",
      "source": "lsa = TruncatedSVD(n_components=2)\nlsa.fit(X_train_tfidf)\nlsa_scores = lsa.transform(X_train_tfidf)\n\nfig = plt.figure(figsize=(16, 16))          \ncolors = ['orange','blue']\n\nplt.scatter(lsa_scores[:,0], \n            lsa_scores[:,1], \n            s=8, alpha=.8, \n            c=y_train,\n            cmap=matplotlib.colors.ListedColormap(colors))\n\nir_patch = mpatches.Patch(color='Orange',label='Irrelevant')\n\ndis_patch = mpatches.Patch(color='Blue',label='Disaster')\n\nplt.legend(handles=[ir_patch, dis_patch], prop={'size': 30})\nplt.show()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "89524e9b6f5d4280208e5b980c9f1dae84047c39",
        "_cell_guid": "51b1dc95-e052-41c2-a9dc-a67a3980e7f7",
        "trusted": false
      },
      "cell_type": "code",
      "source": "clf_tfidf = LogisticRegression()\nclf_tfidf.fit(X_train_tfidf, y_train)\n\ny_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "7bbea19aca4c7e58db14dc3c0eb8ac771a35f409",
        "_cell_guid": "c098db90-271f-41e1-a6c4-30daddb8e237",
        "trusted": false
      },
      "cell_type": "code",
      "source": "accuracy_score(y_pred=y_predicted_tfidf, y_true=y_test)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "2d01271de40c59b6e20aa3dd28c3082da134481d",
        "_cell_guid": "c5d744cd-d7f6-403a-ad11-cd1c2e46bf49",
        "trusted": false
      },
      "cell_type": "code",
      "source": "cm2 = confusion_matrix(y_test, y_predicted_tfidf)\nfig = plt.figure(figsize=(10, 10))\nplot = plot_confusion_matrix(cm2, classes=['Irrelevant','Disaster'], normalize=False, title='Confusion matrix')\nplt.show()\nprint(\"TFIDF confusion matrix\")\nprint(cm2)\nprint(\"BoW confusion matrix\")\nprint(cm)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c7a959e2320b35442649e5ff5648f77e32d47f13",
        "_cell_guid": "e73f1bfd-cfec-4e58-91f3-6049117f8faf",
        "trusted": false
      },
      "cell_type": "code",
      "source": "importance_tfidf = get_most_important_features(tfidf_vectorizer, clf_tfidf, 10)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "4bb515eb1aab6925437de2b802d800415dfbd126",
        "_cell_guid": "43d315db-1671-4b9a-9e50-cfafb39a21b0",
        "trusted": false
      },
      "cell_type": "code",
      "source": "top_scores = [a[0] for a in importance_tfidf[0]['tops']]\ntop_words = [a[1] for a in importance_tfidf[0]['tops']]\nbottom_scores = [a[0] for a in importance_tfidf[0]['bottom']]\nbottom_words = [a[1] for a in importance_tfidf[0]['bottom']]\n\nplot_important_words(top_scores, top_words, bottom_scores, bottom_words, \"Most important words for relevance\")",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "74e0f605a2ee13f284b907a9f43ccdb453cb25a4",
        "_cell_guid": "39e2fe8b-ee17-478c-8d2b-42528e7f5f2f"
      },
      "cell_type": "markdown",
      "source": "# Word vectors"
    },
    {
      "metadata": {
        "_uuid": "68fa9370a74ec5f244facd5576fe37a93561d018",
        "_cell_guid": "1202bc67-e061-4c3f-98a2-9f5ec2b88e57",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "from keras.preprocessing.text import Tokenizer\nimport numpy as np\n\nmax_words = 10000 # We will only consider the 10K most used words in this dataset",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "7f83c302c2adf3b8273216270fc94f2d7f5c23d6",
        "_cell_guid": "9a400915-29d5-40dc-b1bb-312d592d3a02",
        "trusted": false
      },
      "cell_type": "code",
      "source": "tokenizer = Tokenizer(num_words=max_words) # Setup\ntokenizer.fit_on_texts(df['joint_lemmas']) # Generate tokens by counting frequency\nsequences = tokenizer.texts_to_sequences(df['joint_lemmas']) # Turn text into sequence of numbers",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "114941933a81fb2f884273f217118747ceba84e8",
        "_cell_guid": "a2db7532-e2bb-4e9d-9adc-6ca235545a05",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "word_index = tokenizer.word_index\nprint('Token for \"the\"',word_index['the'])\nprint('Token for \"Movie\"',word_index['movie'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "29309b99def8c73987ef283a53b22d04b572118b",
        "_cell_guid": "7d75d768-73fa-4262-95ca-bf3dd7e49697",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "from keras.preprocessing.sequence import pad_sequences\nmaxlen = 140 # Make all sequences 140 words long\ndata = pad_sequences(sequences, maxlen=maxlen)\nprint(data.shape) # We have 25K, 140 word sequences now",
      "execution_count": null,
      "outputs": []
    },
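    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "To double-check the tokenization, we can optionally map the token ids of one padded sequence back to words. This is a small sketch that only uses the `tokenizer` and `data` defined above; id 0 is the padding value:"
    },
    {
      "metadata": {
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Invert the tokenizer's word index and decode the first padded sequence\nindex_to_word = {i: w for w, i in tokenizer.word_index.items()}\nprint(' '.join(index_to_word.get(i, '?') for i in data[0] if i != 0))",
      "execution_count": null,
      "outputs": []
    },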
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "9e5b633e923bc737f29f74dd161b47d0888c8cc1",
        "_cell_guid": "79062d91-7f2f-4034-a33f-3ccf4599f722",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(data,\n                                                    df['relevant'],\n                                                    test_size = 0.2, \n                                                    shuffle=True, \n                                                    random_state = 42)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "ef5f17fbf27f8d306df584c0cf2e44c12910520d",
        "_cell_guid": "264a62b5-0595-4897-8fec-b3a107231edb"
      },
      "cell_type": "markdown",
      "source": "## Training custom word vectors"
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c2e32a607e78fea752873cd4be5b0d53bc6d04bc",
        "_cell_guid": "0e86715b-579e-4053-b1d6-42a503b8603b",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, Flatten, Dense\n\nembedding_dim = 50\n\nmodel = Sequential()\nmodel.add(Embedding(max_words, embedding_dim, input_length=maxlen))\nmodel.add(Flatten())\n#model.add(Dense(32, activation='relu'))\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "719d70a8010e21f8d45795cfa75f299e12524a4a",
        "_cell_guid": "968a9a27-7c29-4283-870c-a17287975c6b",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c455dfa55385cc0b11544aeb589452d33801919a",
        "_cell_guid": "8d8295af-14b6-47ed-8b47-13b15110b191",
        "trusted": false
      },
      "cell_type": "code",
      "source": "type(data)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "778e0c87f6e4236d71184cd1cb764ee63ef5d843",
        "_cell_guid": "e234982b-6248-4278-8240-b3cb07cc676c",
        "trusted": false
      },
      "cell_type": "code",
      "source": "history = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "42b2196a8328a4fd972332cbc0a8b9d5aaf60dbb",
        "_cell_guid": "89923384-9de0-4073-9f5d-2426776c3161",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, LSTM, Dense\n\nembedding_dim = 50\n\nmodel = Sequential()\nmodel.add(Embedding(max_words, embedding_dim, input_length=maxlen))\nmodel.add(LSTM(32))\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "a428ad420748e3d514005eb5b5e4b4db535b16f2",
        "_cell_guid": "34d87334-e8ce-46df-bd82-51307b25e9a7",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "b941198ba2f5e2cba8b795c721ea76782426b186",
        "_cell_guid": "8748ea11-aaf2-4285-bd25-c438dc92979a",
        "trusted": false
      },
      "cell_type": "code",
      "source": "history = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "245f156cc293e53f52ab3ac626e96dcde625b33a",
        "_cell_guid": "8527ecb6-3e88-41c8-a9d5-2f0c220ab335"
      },
      "cell_type": "markdown",
      "source": "## Using pre trained word vectors"
    },
    {
      "metadata": {
        "_uuid": "979b2a21489dbd72f877e904ba6337a685e9b3f9",
        "_cell_guid": "dbfeeec9-22c3-4e86-afbb-d2dad9e934db",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "!ls ../input/glove6b50d",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "9c0ffe0d969ffe2ea50e899b0c3f7834616388a4",
        "_cell_guid": "a62a2619-7002-4394-8f19-c800b6c856f2",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "import os\nglove_dir = '../input/glove6b50d' # This is the folder with the dataset\n\nembeddings_index = {} # We create a dictionary of word -> embedding\nf = open(os.path.join(glove_dir, 'glove.6B.50d.txt')) # Open file\n\n# In the dataset, each line represents a new word embedding\n# The line starts with the word and the embedding values follow\nfor line in f:\n    values = line.split()\n    word = values[0] # The first value is the word, the rest are the values of the embedding\n    embedding = np.asarray(values[1:], dtype='float32') # Load embedding\n    embeddings_index[word] = embedding # Add embedding to our embedding dictionary\nf.close()\n\nprint('Found %s word vectors.' % len(embeddings_index))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "d478c9191174fbcae98d2a3d5715f6f8059eecc2",
        "_cell_guid": "0329b8bd-af00-4529-916f-272c4d3d9bf8",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "all_embs = np.stack(embeddings_index.values())\nemb_mean = all_embs.mean() # Calculate mean\nemb_std = all_embs.std() # Calculate standard deviation\nemb_mean,emb_std",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "9255b95a0476cb3513b6d3f7720a3e26dd376a7a",
        "_cell_guid": "1a6293dc-529a-4a5c-bbef-ce6a65674b73",
        "trusted": false
      },
      "cell_type": "code",
      "source": "embedding_dim = 50\n\nword_index = tokenizer.word_index\nnb_words = min(max_words, len(word_index)) # How many words are there actually\n\n# Create a random matrix with the same mean and std as the embeddings\nembedding_matrix = np.random.normal(emb_mean, \n                                    emb_std, \n                                    (nb_words, embedding_dim))\n\n# The vectors need to be in the same position as their index. \n# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on\n\n# Loop over all words in the word index\nfor word, i in word_index.items():\n    # If we are above the amount of words we want to use we do nothing\n    if i >= max_words: \n        continue\n    # Get the embedding vector for the word\n    embedding_vector = embeddings_index.get(word)\n    # If there is an embedding vector, put it in the embedding matrix\n    if embedding_vector is not None: \n        embedding_matrix[i] = embedding_vector",
      "execution_count": null,
      "outputs": []
    },
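    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "As an optional sanity check (a sketch using only the variables defined above), we can count how many of the tokenizer's most frequent words actually received a GloVe vector; the rest keep their random initialization:"
    },
    {
      "metadata": {
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "# Count how many of the words used by the model have a pre-trained GloVe vector\ncovered = sum(1 for word, i in word_index.items()\n              if i < max_words and word in embeddings_index)\nprint('GloVe covers %d of the %d words used by the model' % (covered, nb_words))",
      "execution_count": null,
      "outputs": []
    },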
    {
      "metadata": {
        "_uuid": "67e35b260d36f211737ab9e8888b77624a8ded38",
        "_cell_guid": "f1dc6336-f412-4f31-914a-97332809907e",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, Flatten, Dense\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n                    embedding_dim, \n                    input_length=maxlen, \n                    weights = [embedding_matrix], trainable = False))\nmodel.add(Flatten())\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "bd076e0161f32ed35f0f0d6eada40830eff41baa",
        "_cell_guid": "4a5bd09b-7403-4268-9195-f46758ee5da6",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "c5bc6ed3f0c741e03fe360b44a612f4603475be5",
        "_cell_guid": "ba10a81f-4ef3-4f08-95c5-41c0dd28f59f",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "history = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "f259e11e9fd8eef34e5a5e5b6f3ce49104e3b368",
        "_cell_guid": "4bca2a70-697d-4503-9adb-5ee461f0c43b",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.layers import CuDNNLSTM\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n                    embedding_dim, \n                    input_length=maxlen, \n                    weights = [embedding_matrix], trainable = False))\nmodel.add(CuDNNLSTM(32))\nmodel.add(Dense(1, activation='sigmoid'))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "7fb806848d74dfc86f7fb3ce22b2372468ba9210",
        "_cell_guid": "294ecb50-47c7-4bb5-977d-bd7223c0a194",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "9e7e84aad05dee260fae5cfa9fcd987946cb0249",
        "_cell_guid": "76e2663c-6de2-4600-a82a-41ef52d17d45",
        "trusted": false
      },
      "cell_type": "code",
      "source": "history = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "87ff7a4e7127c1d600a4be87aeccaf500bb13c1d",
        "_cell_guid": "1ac12f96-ce36-4ef0-be20-90b5584b6c49",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.layers import Bidirectional\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n                    embedding_dim, \n                    input_length=maxlen, \n                    weights = [embedding_matrix], trainable = False))\nmodel.add(Bidirectional(CuDNNLSTM(32)))\nmodel.add(Dense(1, activation='sigmoid'))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "3d6dccd98dd6431740eb583e6949cabcdb694c8f",
        "_cell_guid": "484d1b4e-f256-428d-8f4f-4d9309bc8c01",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "1383614bb50a6ad19f8f8df896bf11bb38de7e10",
        "_cell_guid": "af84529c-9c4c-495c-8ac6-ccbe06927cd9",
        "trusted": false
      },
      "cell_type": "code",
      "source": "history = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "4ca248b1a93edbd5594735db8b4ac4e8aca8cbab",
        "_cell_guid": "c9559831-b2a5-4d86-9468-57a0ca783239",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.layers import Bidirectional\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n                    embedding_dim, \n                    input_length=maxlen, \n                    weights = [embedding_matrix], trainable = False))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(32)))\n\nmodel.add(Dense(1, activation='sigmoid'))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "bd2bdcdcaceb6d5383dd080b8f9c1be458268e37",
        "_cell_guid": "7337bb6f-fa03-47d6-91e8-4f3977ab343c",
        "trusted": false
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])\nhistory = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "73e8f3343e37781f6204fe616d61ec3d3257709f",
        "_cell_guid": "47e3445d-ec5f-49f4-afd1-1047a0070ff8"
      },
      "cell_type": "markdown",
      "source": "# Attention"
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "b5d36e45bb04f7687143ddd7d9b3c590b5cdf6da",
        "_cell_guid": "c481d478-e76e-42cc-bbee-b8e2774fadfd",
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.layers import Multiply, CuDNNLSTM, Permute, Reshape, Dense, Lambda, Input, Embedding, RepeatVector\nimport keras.backend as K\nfrom keras.layers import LSTM\nfrom keras.models import Model",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "3a7d2949-5ec3-437d-83e9-b10e82f40720",
        "_uuid": "dc217be0e1119ea3e95171b0e093406252cba2ec",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "INPUT_DIM = embedding_dim\nTIME_STEPS = maxlen\nSINGLE_ATTENTION_VECTOR = False",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "68b5036a-0d4f-4eb5-b4e0-928d0a03b6a8",
        "_uuid": "33fc3eff7055b154c8be725c49fd547461944ca1",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.layers import *\nfrom keras.layers.core import *\nfrom keras.layers.recurrent import LSTM\nfrom keras.models import *",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_cell_guid": "2c25f8ea-b951-4635-b68d-6ffb8369597f",
        "_uuid": "55a300826e4ca14bb3623829e602c258da61d140",
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "def attention_3d_block(inputs,time_steps,single_attention_vector = False):\n    # inputs.shape = (batch_size, time_steps, input_dim)\n    input_dim = int(inputs.shape[2])\n    a = Permute((2, 1),name='Attent_Permute')(inputs)\n    a = Reshape((input_dim, time_steps),name='Reshape')(a) # this line is not useful. It's just to know which dimension is what.\n    a = Dense(time_steps, activation='softmax', name='Attent_Dense')(a) # Create attention vector\n    if single_attention_vector:\n        # If we just need one attention vector it over all input dimensions\n        a = Lambda(lambda x: K.mean(x, axis=1), name='Dim_reduction')(a) \n        a = RepeatVector(input_dim, name='Repeat')(a)\n    a_probs = Permute((2, 1), name='Attention_vec')(a) # Swap time steps, input dim axis back\n    output_attention_mul = Multiply(name='Attention_mul')([inputs, a_probs]) # Multiply input with attention vector\n    return output_attention_mul",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "848fac340b791c6be5acaa6bddc2573e58ffef5c",
        "_cell_guid": "dba7e9d1-bb4f-4b40-8ea9-bcebb9a62eb3",
        "trusted": false
      },
      "cell_type": "code",
      "source": "input_tokens = Input(shape=(maxlen,),name='input')\n\nembedding = Embedding(max_words, \n                      embedding_dim, \n                      input_length=maxlen, \n                      weights = [embedding_matrix], \n                      trainable = False, name='embedding')(input_tokens)\n\nattention_mul = attention_3d_block(inputs = embedding,\n                                   time_steps = maxlen,\n                                   single_attention_vector = True)\n\nlstm_out = CuDNNLSTM(32, return_sequences=True, name='lstm')(attention_mul)\n\n\n\nattention_mul = Flatten(name='flatten')(attention_mul)\noutput = Dense(1, activation='sigmoid',name='output')(attention_mul)\nmodel = Model(input_tokens, output)\n",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "5658dc91a1ad3249f67606ef0a9a833474291c9b",
        "_cell_guid": "c2ab7a54-d667-473a-aea6-38aa5e2aded6",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "model.summary()",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "_uuid": "308d4cf031a26c03afb32121a7ee2e2802e7e3be",
        "_cell_guid": "b998ef0b-92e2-4836-a7a8-c0c39195ce73",
        "trusted": false,
        "collapsed": true
      },
      "cell_type": "code",
      "source": "model.compile(optimizer='adam',\n              loss='binary_crossentropy',\n              metrics=['acc'])\nhistory = model.fit(X_train, y_train,\n                    epochs=10,\n                    batch_size=32,\n                    validation_data=(X_test, y_test))",
      "execution_count": null,
      "outputs": []
    },
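    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Because every layer in the attention block is named, we can optionally pull the attention weights out of the trained model and inspect which token positions it focuses on. This is a sketch that assumes the `Attention_vec` layer name from `attention_3d_block` above:"
    },
    {
      "metadata": {
        "collapsed": true,
        "trusted": false
      },
      "cell_type": "code",
      "source": "from keras.models import Model\n\n# Build a model that outputs the attention weights instead of the prediction\nattention_layer_model = Model(inputs=model.input,\n                              outputs=model.get_layer('Attention_vec').output)\n\nattention_weights = attention_layer_model.predict(X_test[:5])\nprint(attention_weights.shape) # (5, maxlen, embedding_dim); the weights are repeated along the last axis\nprint(attention_weights[0, :, 0]) # Attention over the 140 token positions of the first test tweet",
      "execution_count": null,
      "outputs": []
    },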
    {
      "metadata": {
        "_uuid": "80beef42ce6897ce3d95a847f8e453fca73dddbc",
        "_cell_guid": "2fa46bb2-23bc-4576-9986-539d91f7258b"
      },
      "cell_type": "markdown",
      "source": "# Similarity"
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "3deb06532737e9102da392b064f9094812b3e4fb",
        "_cell_guid": "7f40e8a4-6666-4566-829d-d3e97a7f68e5",
        "trusted": false
      },
      "cell_type": "code",
      "source": "nlp = spacy.load('en')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "8fd3cfcbd3b8f56bd5b5ecb75c9beb9c62a05e72",
        "_cell_guid": "3565bf5d-952c-4d65-b606-02bb197378e9",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup1 = nlp('I would like to open a new checking account')\nsup2 = nlp('How do I open a checking account?')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "22e202af2090b6d6e4409b332aa5bd3b19f41ec8",
        "_cell_guid": "710b92bd-6f00-4d35-b838-b86b44ad9c07",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup1.similarity(sup2)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "4be12193ef0e61d0b8539c360f8998fb1029a4f2",
        "_cell_guid": "fa782be2-d9c0-4218-b5cb-6358639e38d1",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup3 = nlp('I want to close my checking account')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "d1152d966d1274bd4ed4a065d251f3bb9ffc0f94",
        "_cell_guid": "79b3e396-1372-4898-ac2d-b4337bfa9d8b",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup1.similarity(sup3)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "463f43b2b11180982e1a233c7452c362beccf0ec",
        "_cell_guid": "986e5d2e-a862-4822-970a-2026e86c72c8",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup4 = nlp('I like checking the news')",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "51404d196b7f4ae555a59fd264e5025798184071",
        "_cell_guid": "bd596f74-7074-4401-bf30-09ccd7e0429e",
        "trusted": false
      },
      "cell_type": "code",
      "source": "sup1.similarity(sup4)",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "c6aee51bff6bb4c6f9f79d8f658ac82073487583",
        "_cell_guid": "dbff75ea-c178-47ad-8afc-d0722c2b5053",
        "trusted": false
      },
      "cell_type": "code",
      "source": "import sense2vec",
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "collapsed": true,
        "_uuid": "2955e7ccd3ccc2ada4efc8d8cb9670d74a368c70",
        "_cell_guid": "5943ff8c-4e88-48bd-981b-5599b830daaf",
        "trusted": false
      },
      "cell_type": "code",
      "source": "def attention_3d_block(inputs,maxlen,single_attention_vector = False):\n    # inputs.shape = (batch_size, time_steps, input_dim)\n    input_dim = int(inputs.shape[2])\n    time_steps = int(inputs.shape[1])\n    #print(input_dim,time_steps)\n    a = Permute((2, 1))(inputs) # Swap axis 1 & 2\n    a = Reshape((input_dim, maxlen))(a) # this line is not useful. It's just to know which dimension is what.\n    a = Dense(maxlen, activation='softmax')(a) # Create dense layer to apply to input\n    if single_attention_vector:\n        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)\n        a = RepeatVector(input_dim)(a)\n    a_probs = Permute((2, 1), name='attention_vec')(a)\n    output_attention_mul = Multiply(name='attention_mul')([inputs, a_probs])\n    return output_attention_mul",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.6.4",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}