{ "cells": [ { "metadata": { "collapsed": true, "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a", "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "trusted": false }, "cell_type": "code", "source": "import numpy as np\nimport pandas as pd", "execution_count": null, "outputs": [] },
{ "metadata": { "_uuid": "2c308efde1fe4889500ee5c49df7145c29ba7fd4", "_cell_guid": "00abb666-e575-4785-a1a3-02bb58590df7", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "!ls ../input", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "d44fe8e6b84bce96399f2435583b6feec0b7cd0d", "_cell_guid": "39064b57-38ce-4310-a904-fbbfbde53e47", "trusted": false }, "cell_type": "code", "source": "import codecs\n\n# Re-encode the raw CSV as clean UTF-8: bytes that cannot be decoded become U+FFFD instead of raising\ninput_file = codecs.open('../input/disasters-on-social-media/socialmedia-disaster-tweets-DFE.csv',\n                         'r',\n                         encoding='utf-8',\n                         errors='replace')\n# Write UTF-8 explicitly: the platform default codec may be unable to encode U+FFFD,\n# and pd.read_csv below reads the file back as UTF-8\noutput_file = open('clean_socialmedia-disaster.csv', 'w', encoding='utf-8')\n\n# Both handles are closed by the next two cells\nfor line in input_file:\n    output_file.write(line)\n", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "6a630a757e8576000f55dfa9d47bb8588fe8b4f8", "_cell_guid": "9c5a237c-53ec-4658-9361-cbd4f8264545", "trusted": false }, "cell_type": "code", "source": "input_file.close()", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "e62d9d3b7ab866b365a5e559c41166256e6b5a2f", "_cell_guid": "44956c28-a0f2-44c5-8357-980f8af0c0f5", "trusted": false }, "cell_type": "code", "source": "output_file.close()", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "cc4d7c47dead195f110c14d6d5d69846519f6a48", "_cell_guid": "65d3add6-d004-4f0d-8d76-a8911f9c750d", "trusted": false }, "cell_type": "code", "source": "df = pd.read_csv('clean_socialmedia-disaster.csv')", "execution_count": null, "outputs": [] },
{ "metadata": { "_uuid": "2364d37d3b1f189988ce40f0017d44846615f607", "_cell_guid": "21f03b7a-959e-4aca-a77e-ef79b0b34b40", 
"trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "2e523f8e869affd075fbe49c29979cf0faef01d5", "_cell_guid": "d60abee3-1dab-4f88-8000-04654bb49ef0", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.shape", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "9eb71e33dbc95d5645e10f80881637fea0fd9c23", "_cell_guid": "12fc0a9f-4fc4-49c4-90ea-5795344e9a34", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.choose_one.unique()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "3a28f1f7bc1997b61b06f3c002736273eb70a5d6", "_cell_guid": "898d4317-d0a1-4352-a914-bb0644e5e476", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df['choose_one'].value_counts().plot(kind='bar')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "1692c9ca79db9937ea3e69426af257b4f045e59b", "_cell_guid": "8068263e-b24f-429d-9d46-0b75bc092b4d", "trusted": false }, "cell_type": "code", "source": "df = df[df.choose_one != \"Can't Decide\"]", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "802af3ed901ad8ecd4fd52a27c2723ec697cf883", "_cell_guid": "6193dd41-7d41-4942-87cc-129264c260b0", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.shape", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "7859dea634be4d45e8db7d0a553f88033670a2bf", "_cell_guid": "27e9dcc2-0a2c-44d8-8c76-02ca6e78cae9", "trusted": false }, "cell_type": "code", "source": "df = df[['text','choose_one']]", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "3ec9b954eb4a7a96be0cc84afa72bb7ccee79680", "_cell_guid": "c2035dbe-a0d1-4de1-b2e3-64b19f373480", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, 
"_uuid": "addb57d580d894cc3f2d82e59734f7df334770e6", "_cell_guid": "73a35c46-9148-43ca-baa2-9a1ed99b41e9", "trusted": false }, "cell_type": "code", "source": "df['relevant'] = df.choose_one.map({'Relevant':1,'Not Relevant':0})", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "4e177724a2f53bd71c5062074eb3fa5482a9b545", "_cell_guid": "4808d1d5-3072-4cfd-8568-174794b105ae", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "683ce9a5cdaab3f4028690fbf9419a3a3be94457", "_cell_guid": "6e9cb347-8e9f-40e9-aa4b-ddd396f67dc1", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.describe()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "4811145728d79c4cdcf432910cf5188d392a918a", "_cell_guid": "de7a666e-abea-4d8c-96cc-f0487363c620" }, "cell_type": "markdown", "source": "# Lemmatization" }, { "metadata": { "collapsed": true, "_uuid": "413fbc49a7669a9b2f229b2331badc96242e4db5", "_cell_guid": "3a606a1e-cc2b-4ece-8049-fc314b8706cb", "trusted": false }, "cell_type": "code", "source": "import spacy", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "f186bdd90856bdd20ed500e28a85e9576648155a", "_cell_guid": "7f7a2b1b-3e89-4272-834b-fae5eb7da830", "trusted": false }, "cell_type": "code", "source": "nlp = spacy.load('en',disable=['tagger','parser','ner'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c465fe8007f47352b0ff6d5540b69bdf401e3464", "_cell_guid": "cf5b018d-ac0c-4a74-97f1-d9e85ebd5f3a", "trusted": false }, "cell_type": "code", "source": "from tqdm import tqdm, tqdm_notebook\ntqdm.pandas(tqdm_notebook)", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "5f6c14cc863360f6233d89d90528886b97dd790d", "_cell_guid": "08527279-9f8b-4739-93f6-84678551db45", "trusted": false, "collapsed": true }, "cell_type": "code", "source": 
"df['lemmas'] = df[\"text\"].progress_apply(lambda row: \n [w.lemma_ for w in nlp(row)])", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "65fbb1885819974193c8df569681f8dd3962aef8", "_cell_guid": "5429aea8-6066-4a3d-a129-560e71330c49", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "64337904344f9a66eadbd17245fa2c51c168ee5d", "_cell_guid": "1053aa84-fddf-47e4-82a6-d18e41aba506", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df['joint_lemmas'] = df['lemmas'].progress_apply(lambda row: ' '.join(row))", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "22eb152feddf5fc5c3c4f8f25475278214f03d0c", "_cell_guid": "1d39c48f-c4d8-4ed2-a353-ba5c8b67d28a", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "df.head()", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "815ea14477cf9059329bd11e16f02f5fc93e6960", "_cell_guid": "564d3ec1-1095-4462-a7e7-e976acee1bae" }, "cell_type": "markdown", "source": "# Bag of words" }, { "metadata": { "collapsed": true, "_uuid": "b3ec80840bc0edb7e4829008c365e242065729cf", "_cell_guid": "2c8f1734-740f-48f6-931e-311d092e8b7a", "trusted": false }, "cell_type": "code", "source": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(df['joint_lemmas'], \n df['relevant'], \n test_size=0.2,\n random_state=40)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "ca726056dc4ba9758407c3755dc667ed2eee122b", "_cell_guid": "c7be9c52-4642-4b17-8b1c-855a4ccc6121", "trusted": false }, "cell_type": "code", "source": "\nfrom sklearn.feature_extraction.text import CountVectorizer\n\ncount_vectorizer = CountVectorizer(max_features=5000)\nX_train_counts = count_vectorizer.fit_transform(X_train)\nX_test_counts = count_vectorizer.transform(X_test)", "execution_count": null, 
"outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "a5603f2985bda0442b5a19eeddf274a8757cf16d", "_cell_guid": "0696dc01-a0b5-48f3-a4c7-a48f9f029b5f", "trusted": false }, "cell_type": "code", "source": "X_train_counts.shape", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "0d241b3b7627f8c4fc61c68930f1f98e38a6af4d", "_cell_guid": "369f85ff-b263-4fbf-b76a-f967742c4bcc", "trusted": false }, "cell_type": "code", "source": "from sklearn.decomposition import TruncatedSVD\nimport matplotlib.pyplot as plt\nimport matplotlib.patches as mpatches\nimport matplotlib\n\n \n\nlsa = TruncatedSVD(n_components=2)\nlsa.fit(X_train_counts)\nlsa_scores = lsa.transform(X_train_counts)\n\n\nfig = plt.figure(figsize=(16, 16)) \ncolors = ['orange','blue']\n\nplt.scatter(lsa_scores[:,0], \n lsa_scores[:,1], \n s=8, alpha=.8, \n c=y_train,\n cmap=matplotlib.colors.ListedColormap(colors))\n\nir_patch = mpatches.Patch(color='Orange',label='Irrelevant')\n\ndis_patch = mpatches.Patch(color='Blue',label='Disaster')\n\nplt.legend(handles=[ir_patch, dis_patch], prop={'size': 30})\n\nplt.show()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c81a3571b9b9f99e03578280fde240f777e284e4", "_cell_guid": "72371d13-e2ae-4639-a910-adb436b3ff41", "trusted": false }, "cell_type": "code", "source": "from sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import accuracy_score\nclf = LogisticRegression()\n\nclf.fit(X_train_counts, y_train)\n\ny_predicted = clf.predict(X_test_counts)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "5ad6fc4281ec6d3b870ff548dab0f5bbbee5681f", "_cell_guid": "cf4cd8ef-b053-4454-924c-e8e14309d8a1", "trusted": false }, "cell_type": "code", "source": "accuracy_score(y_test, y_predicted)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "88fac49aad6ededc62b7c5ebc5519547b94b3bab", "_cell_guid": 
"0615a89f-b61a-40f3-9fdf-fbac04d70886", "trusted": false }, "cell_type": "code", "source": "import numpy as np\nimport itertools\nfrom sklearn.metrics import confusion_matrix\n\ndef plot_confusion_matrix(cm, classes,\n                          normalize=False,\n                          title='Confusion matrix',\n                          cmap=plt.cm.winter):\n    \"\"\"Draw a confusion matrix as an annotated heatmap on the current figure.\n\n    cm: 2-D array of counts (rows = true labels, columns = predicted labels).\n    classes: tick labels, in the same order as the rows/columns of cm.\n    normalize: if True, divide each row by its sum and print fractions.\n    title, cmap: heading text and colormap passed on to matplotlib.\n    Returns the pyplot module so the caller can keep styling the figure.\n    \"\"\"\n    if normalize:\n        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n    plt.title(title, fontsize=30)\n    plt.colorbar()\n    tick_marks = np.arange(len(classes))\n    plt.xticks(tick_marks, classes, fontsize=20)\n    plt.yticks(tick_marks, classes, fontsize=20)\n\n    fmt = '.2f' if normalize else 'd'\n    # Cells below half of the maximum get white text, the others black\n    thresh = cm.max() / 2.\n\n    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment=\"center\",\n                 color=\"white\" if cm[i, j] < thresh else \"black\", fontsize=40)\n\n    plt.tight_layout()\n    plt.ylabel('True label', fontsize=30)\n    plt.xlabel('Predicted label', fontsize=30)\n\n    return plt", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "01be8be6a0d160ad59da17553d556d3bf7ae3a36", "_cell_guid": "980c484f-5994-4fd0-9009-25dd620f002a", "trusted": false }, "cell_type": "code", "source": "cm = confusion_matrix(y_test, y_predicted)\nfig = plt.figure(figsize=(10, 10))\nplot = plot_confusion_matrix(cm, classes=['Irrelevant','Disaster'], normalize=False, title='Confusion matrix')\nplt.show()\nprint(cm)\n", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "230f3949f4cb5713a44121f87939a684c06d3789", "_cell_guid": "a3ecd467-9ae0-4097-bce2-07876c8f05bf", "trusted": false }, "cell_type": "code", "source": "def get_most_important_features(vectorizer, model, n=5):\n    \"\"\"Return the n highest and n lowest coefficients of a linear model, per row of coef_.\n\n    vectorizer: fitted vectorizer exposing vocabulary_ (token -> column index).\n    model: fitted estimator exposing coef_ with one row per class index.\n    Returns {class_index: {'tops': [(coef, token), ...] in ascending order,\n                           'bottom': [(coef, token), ...] in descending order}}.\n    \"\"\"\n    index_to_word = {v: k for k, v in vectorizer.vocabulary_.items()}\n\n    classes = {}\n    for class_index in range(model.coef_.shape[0]):\n        word_importances = [(el, index_to_word[i]) for i, el in enumerate(model.coef_[class_index])]\n        sorted_coeff = sorted(word_importances, key=lambda x: x[0], reverse=True)\n        tops = sorted(sorted_coeff[:n], key=lambda x: x[0])\n        # A bare [-n:] would return the whole list when n == 0\n        bottom = sorted_coeff[-n:] if n else []\n        classes[class_index] = {\n            'tops': tops,\n            'bottom': bottom\n        }\n    return classes\n\nimportance = get_most_important_features(count_vectorizer, clf, 10)", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "bda2d90fd49c25e9ac7885fc7ccbd3b0844492e9", "_cell_guid": "45e37db0-3cf1-4dd5-801a-fadc08815bf7", "trusted": false }, "cell_type": "code", "source": "def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):\n    \"\"\"Plot the lowest and highest coefficients as two horizontal bar charts.\n\n    top_scores, top_words: coefficients and tokens shown in the 'Disaster' panel.\n    bottom_scores, bottom_words: coefficients and tokens shown in the 'Irrelevant' panel.\n    name: title of the whole figure.\n    \"\"\"\n    # Order each side so that its most extreme coefficient is drawn at the top of the panel\n    top_pairs = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])\n    bottom_pairs = sorted(zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True)\n\n    plt.figure(figsize=(10, 10))\n    # A figure has a single suptitle, so it is set once\n    plt.suptitle(name, fontsize=16)\n\n    panels = [(121, 'Irrelevant', bottom_pairs), (122, 'Disaster', top_pairs)]\n    for position, label, pairs in panels:\n        words = [word for word, _ in pairs]\n        scores = [score for _, score in pairs]\n        # Bar positions are computed per panel so the two lists may differ in length\n        y_pos = np.arange(len(words))\n        plt.subplot(position)\n        plt.barh(y_pos, scores, align='center', alpha=0.5)\n        plt.title(label, fontsize=20)\n        plt.yticks(y_pos, words, fontsize=14)\n        plt.xlabel('Importance', fontsize=20)\n\n    plt.subplots_adjust(wspace=0.8)\n    plt.show()\n\ntop_scores = [a[0] for a in importance[0]['tops']]\ntop_words = [a[1] for a in importance[0]['tops']]\nbottom_scores = [a[0] for a in importance[0]['bottom']]\nbottom_words = [a[1] for a in importance[0]['bottom']]\n\nplot_important_words(top_scores, top_words, bottom_scores, 
bottom_words, \"Most important words for relevance\")", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "357989b0237da7a84778df374ebcd5e876724c87", "_cell_guid": "9d1217cb-172f-46b4-92e5-0fa73f8a9208", "trusted": false }, "cell_type": "code", "source": "from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf_vectorizer = TfidfVectorizer(max_features=10000)\n\nX_train_tfidf = tfidf_vectorizer.fit_transform(X_train)\nX_test_tfidf = tfidf_vectorizer.transform(X_test)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c7c828806f3408d5de847b264adcb9789730a8b9", "_cell_guid": "aa0ac1e0-e6fa-47a9-a439-542119c07a46", "trusted": false }, "cell_type": "code", "source": "lsa = TruncatedSVD(n_components=2)\nlsa.fit(X_train_tfidf)\nlsa_scores = lsa.transform(X_train_tfidf)\n\nfig = plt.figure(figsize=(16, 16)) \ncolors = ['orange','blue']\n\nplt.scatter(lsa_scores[:,0], \n lsa_scores[:,1], \n s=8, alpha=.8, \n c=y_train,\n cmap=matplotlib.colors.ListedColormap(colors))\n\nir_patch = mpatches.Patch(color='Orange',label='Irrelevant')\n\ndis_patch = mpatches.Patch(color='Blue',label='Disaster')\n\nplt.legend(handles=[ir_patch, dis_patch], prop={'size': 30})\nplt.show()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "89524e9b6f5d4280208e5b980c9f1dae84047c39", "_cell_guid": "51b1dc95-e052-41c2-a9dc-a67a3980e7f7", "trusted": false }, "cell_type": "code", "source": "clf_tfidf = LogisticRegression()\nclf_tfidf.fit(X_train_tfidf, y_train)\n\ny_predicted_tfidf = clf_tfidf.predict(X_test_tfidf)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "7bbea19aca4c7e58db14dc3c0eb8ac771a35f409", "_cell_guid": "c098db90-271f-41e1-a6c4-30daddb8e237", "trusted": false }, "cell_type": "code", "source": "accuracy_score(y_pred=y_predicted_tfidf, y_true=y_test)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, 
"_uuid": "2d01271de40c59b6e20aa3dd28c3082da134481d", "_cell_guid": "c5d744cd-d7f6-403a-ad11-cd1c2e46bf49", "trusted": false }, "cell_type": "code", "source": "cm2 = confusion_matrix(y_test, y_predicted_tfidf)\nfig = plt.figure(figsize=(10, 10))\nplot = plot_confusion_matrix(cm2, classes=['Irrelevant','Disaster'], normalize=False, title='Confusion matrix')\nplt.show()\nprint(\"TFIDF confusion matrix\")\nprint(cm2)\nprint(\"BoW confusion matrix\")\nprint(cm)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c7a959e2320b35442649e5ff5648f77e32d47f13", "_cell_guid": "e73f1bfd-cfec-4e58-91f3-6049117f8faf", "trusted": false }, "cell_type": "code", "source": "importance_tfidf = get_most_important_features(tfidf_vectorizer, clf_tfidf, 10)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "4bb515eb1aab6925437de2b802d800415dfbd126", "_cell_guid": "43d315db-1671-4b9a-9e50-cfafb39a21b0", "trusted": false }, "cell_type": "code", "source": "top_scores = [a[0] for a in importance_tfidf[0]['tops']]\ntop_words = [a[1] for a in importance_tfidf[0]['tops']]\nbottom_scores = [a[0] for a in importance_tfidf[0]['bottom']]\nbottom_words = [a[1] for a in importance_tfidf[0]['bottom']]\n\nplot_important_words(top_scores, top_words, bottom_scores, bottom_words, \"Most important words for relevance\")", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "74e0f605a2ee13f284b907a9f43ccdb453cb25a4", "_cell_guid": "39e2fe8b-ee17-478c-8d2b-42528e7f5f2f" }, "cell_type": "markdown", "source": "# Word vectors" }, { "metadata": { "_uuid": "68fa9370a74ec5f244facd5576fe37a93561d018", "_cell_guid": "1202bc67-e061-4c3f-98a2-9f5ec2b88e57", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "from keras.preprocessing.text import Tokenizer\nimport numpy as np\n\nmax_words = 10000 # We will only consider the 10K most used words in this dataset", "execution_count": null, "outputs": 
[] }, { "metadata": { "collapsed": true, "_uuid": "7f83c302c2adf3b8273216270fc94f2d7f5c23d6", "_cell_guid": "9a400915-29d5-40dc-b1bb-312d592d3a02", "trusted": false }, "cell_type": "code", "source": "tokenizer = Tokenizer(num_words=max_words) # Setup\ntokenizer.fit_on_texts(df['joint_lemmas']) # Generate tokens by counting frequency\nsequences = tokenizer.texts_to_sequences(df['joint_lemmas']) # Turn text into sequence of numbers", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "114941933a81fb2f884273f217118747ceba84e8", "_cell_guid": "a2db7532-e2bb-4e9d-9adc-6ca235545a05", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "word_index = tokenizer.word_index\nprint('Token for \"the\"',word_index['the'])\nprint('Token for \"Movie\"',word_index['movie'])", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "29309b99def8c73987ef283a53b22d04b572118b", "_cell_guid": "7d75d768-73fa-4262-95ca-bf3dd7e49697", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "from keras.preprocessing.sequence import pad_sequences\nmaxlen = 140 # Make all sequences 140 words long\ndata = pad_sequences(sequences, maxlen=maxlen)\nprint(data.shape) # (number of tweets, 140): one padded or truncated sequence per tweet", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "9e5b633e923bc737f29f74dd161b47d0888c8cc1", "_cell_guid": "79062d91-7f2f-4034-a33f-3ccf4599f722", "trusted": false }, "cell_type": "code", "source": "from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(data,\n df['relevant'],\n test_size = 0.2, \n shuffle=True, \n random_state = 42)", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "ef5f17fbf27f8d306df584c0cf2e44c12910520d", "_cell_guid": "264a62b5-0595-4897-8fec-b3a107231edb" }, "cell_type": "markdown", "source": "## Training custom word vectors" }, { "metadata": { "collapsed": true, "_uuid": 
"c2e32a607e78fea752873cd4be5b0d53bc6d04bc", "_cell_guid": "0e86715b-579e-4053-b1d6-42a503b8603b", "trusted": false }, "cell_type": "code", "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, Flatten, Dense\n\nembedding_dim = 50\n\nmodel = Sequential()\nmodel.add(Embedding(max_words, embedding_dim, input_length=maxlen))\nmodel.add(Flatten())\n#model.add(Dense(32, activation='relu'))\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "719d70a8010e21f8d45795cfa75f299e12524a4a", "_cell_guid": "968a9a27-7c29-4283-870c-a17287975c6b", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c455dfa55385cc0b11544aeb589452d33801919a", "_cell_guid": "8d8295af-14b6-47ed-8b47-13b15110b191", "trusted": false }, "cell_type": "code", "source": "type(data)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "778e0c87f6e4236d71184cd1cb764ee63ef5d843", "_cell_guid": "e234982b-6248-4278-8240-b3cb07cc676c", "trusted": false }, "cell_type": "code", "source": "history = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "42b2196a8328a4fd972332cbc0a8b9d5aaf60dbb", "_cell_guid": "89923384-9de0-4073-9f5d-2426776c3161", "trusted": false }, "cell_type": "code", "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, LSTM, Dense\n\nembedding_dim = 50\n\nmodel = Sequential()\nmodel.add(Embedding(max_words, embedding_dim, input_length=maxlen))\nmodel.add(LSTM(32))\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": 
"a428ad420748e3d514005eb5b5e4b4db535b16f2", "_cell_guid": "34d87334-e8ce-46df-bd82-51307b25e9a7", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "b941198ba2f5e2cba8b795c721ea76782426b186", "_cell_guid": "8748ea11-aaf2-4285-bd25-c438dc92979a", "trusted": false }, "cell_type": "code", "source": "history = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "245f156cc293e53f52ab3ac626e96dcde625b33a", "_cell_guid": "8527ecb6-3e88-41c8-a9d5-2f0c220ab335" }, "cell_type": "markdown", "source": "## Using pre trained word vectors" }, { "metadata": { "_uuid": "979b2a21489dbd72f877e904ba6337a685e9b3f9", "_cell_guid": "dbfeeec9-22c3-4e86-afbb-d2dad9e934db", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "!ls ../input/glove6b50d", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "9c0ffe0d969ffe2ea50e899b0c3f7834616388a4", "_cell_guid": "a62a2619-7002-4394-8f19-c800b6c856f2", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "import os\nglove_dir = '../input/glove6b50d' # This is the folder with the dataset\n\nembeddings_index = {} # We create a dictionary of word -> embedding\nf = open(os.path.join(glove_dir, 'glove.6B.50d.txt')) # Open file\n\n# In the dataset, each line represents a new word embedding\n# The line starts with the word and the embedding values follow\nfor line in f:\n values = line.split()\n word = values[0] # The first value is the word, the rest are the values of the embedding\n embedding = np.asarray(values[1:], dtype='float32') # Load embedding\n embeddings_index[word] = embedding # Add embedding to our embedding dictionary\nf.close()\n\nprint('Found %s word vectors.' 
% len(embeddings_index))", "execution_count": null, "outputs": [] },
{ "metadata": { "_uuid": "d478c9191174fbcae98d2a3d5715f6f8059eecc2", "_cell_guid": "0329b8bd-af00-4529-916f-272c4d3d9bf8", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "# np.stack needs a real sequence; a dict view is deprecated and then rejected by newer NumPy releases\nall_embs = np.stack(list(embeddings_index.values()))\nemb_mean = all_embs.mean() # Calculate mean\nemb_std = all_embs.std() # Calculate standard deviation\nemb_mean,emb_std", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "9255b95a0476cb3513b6d3f7720a3e26dd376a7a", "_cell_guid": "1a6293dc-529a-4a5c-bbef-ce6a65674b73", "trusted": false }, "cell_type": "code", "source": "embedding_dim = 50\n\nword_index = tokenizer.word_index\nnb_words = min(max_words, len(word_index)) # How many words are there actually\n\n# Create a random matrix with the same mean and std as the embeddings.\n# It has max_words rows (not nb_words): rows are addressed by token id up to max_words - 1\n# and the matrix is loaded into Embedding(max_words, ...), so a vocabulary smaller than\n# max_words would otherwise cause an out-of-range index or a weight shape mismatch.\nembedding_matrix = np.random.normal(emb_mean,\n                                    emb_std,\n                                    (max_words, embedding_dim))\n\n# The vectors need to be in the same position as their index. 
\n# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on\n\n# Loop over all words in the word index\nfor word, i in word_index.items():\n # If we are above the amount of words we want to use we do nothing\n if i >= max_words: \n continue\n # Get the embedding vector for the word\n embedding_vector = embeddings_index.get(word)\n # If there is an embedding vector, put it in the embedding matrix\n if embedding_vector is not None: \n embedding_matrix[i] = embedding_vector", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "67e35b260d36f211737ab9e8888b77624a8ded38", "_cell_guid": "f1dc6336-f412-4f31-914a-97332809907e", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "from keras.models import Sequential\nfrom keras.layers import Embedding, Flatten, Dense\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n embedding_dim, \n input_length=maxlen, \n weights = [embedding_matrix], trainable = False))\nmodel.add(Flatten())\nmodel.add(Dense(1, activation='sigmoid'))\nmodel.summary()", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "bd076e0161f32ed35f0f0d6eada40830eff41baa", "_cell_guid": "4a5bd09b-7403-4268-9195-f46758ee5da6", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "c5bc6ed3f0c741e03fe360b44a612f4603475be5", "_cell_guid": "ba10a81f-4ef3-4f08-95c5-41c0dd28f59f", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "history = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "f259e11e9fd8eef34e5a5e5b6f3ce49104e3b368", "_cell_guid": "4bca2a70-697d-4503-9adb-5ee461f0c43b", "trusted": false }, "cell_type": "code", "source": "from keras.layers import 
CuDNNLSTM\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n embedding_dim, \n input_length=maxlen, \n weights = [embedding_matrix], trainable = False))\nmodel.add(CuDNNLSTM(32))\nmodel.add(Dense(1, activation='sigmoid'))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "7fb806848d74dfc86f7fb3ce22b2372468ba9210", "_cell_guid": "294ecb50-47c7-4bb5-977d-bd7223c0a194", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "9e7e84aad05dee260fae5cfa9fcd987946cb0249", "_cell_guid": "76e2663c-6de2-4600-a82a-41ef52d17d45", "trusted": false }, "cell_type": "code", "source": "history = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "87ff7a4e7127c1d600a4be87aeccaf500bb13c1d", "_cell_guid": "1ac12f96-ce36-4ef0-be20-90b5584b6c49", "trusted": false }, "cell_type": "code", "source": "from keras.layers import Bidirectional\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n embedding_dim, \n input_length=maxlen, \n weights = [embedding_matrix], trainable = False))\nmodel.add(Bidirectional(CuDNNLSTM(32)))\nmodel.add(Dense(1, activation='sigmoid'))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "3d6dccd98dd6431740eb583e6949cabcdb694c8f", "_cell_guid": "484d1b4e-f256-428d-8f4f-4d9309bc8c01", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "1383614bb50a6ad19f8f8df896bf11bb38de7e10", "_cell_guid": "af84529c-9c4c-495c-8ac6-ccbe06927cd9", "trusted": false }, "cell_type": "code", "source": "history = model.fit(X_train, y_train,\n 
epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "4ca248b1a93edbd5594735db8b4ac4e8aca8cbab", "_cell_guid": "c9559831-b2a5-4d86-9468-57a0ca783239", "trusted": false }, "cell_type": "code", "source": "from keras.layers import Bidirectional\nmodel = Sequential()\nmodel.add(Embedding(max_words, \n embedding_dim, \n input_length=maxlen, \n weights = [embedding_matrix], trainable = False))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(64,return_sequences=True)))\nmodel.add(Bidirectional(CuDNNLSTM(32)))\n\nmodel.add(Dense(1, activation='sigmoid'))", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "bd2bdcdcaceb6d5383dd080b8f9c1be458268e37", "_cell_guid": "7337bb6f-fa03-47d6-91e8-4f3977ab343c", "trusted": false }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])\nhistory = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "73e8f3343e37781f6204fe616d61ec3d3257709f", "_cell_guid": "47e3445d-ec5f-49f4-afd1-1047a0070ff8" }, "cell_type": "markdown", "source": "# Attention" }, { "metadata": { "collapsed": true, "_uuid": "b5d36e45bb04f7687143ddd7d9b3c590b5cdf6da", "_cell_guid": "c481d478-e76e-42cc-bbee-b8e2774fadfd", "trusted": false }, "cell_type": "code", "source": "from keras.layers import Multiply, CuDNNLSTM, Permute, Reshape, Dense, Lambda, Input, Embedding, RepeatVector\nimport keras.backend as K\nfrom keras.layers import LSTM\nfrom keras.models import Model", "execution_count": null, "outputs": [] }, { "metadata": { "_cell_guid": "3a7d2949-5ec3-437d-83e9-b10e82f40720", "_uuid": "dc217be0e1119ea3e95171b0e093406252cba2ec", "collapsed": true, 
"trusted": false }, "cell_type": "code", "source": "INPUT_DIM = embedding_dim\nTIME_STEPS = maxlen\nSINGLE_ATTENTION_VECTOR = False", "execution_count": null, "outputs": [] },
{ "metadata": { "_cell_guid": "68b5036a-0d4f-4eb5-b4e0-928d0a03b6a8", "_uuid": "33fc3eff7055b154c8be725c49fd547461944ca1", "collapsed": true, "trusted": false }, "cell_type": "code", "source": "from keras.layers import *\nfrom keras.layers.core import *\nfrom keras.layers.recurrent import LSTM\nfrom keras.models import *", "execution_count": null, "outputs": [] },
{ "metadata": { "_cell_guid": "2c25f8ea-b951-4635-b68d-6ffb8369597f", "_uuid": "55a300826e4ca14bb3623829e602c258da61d140", "collapsed": true, "trusted": false }, "cell_type": "code", "source": "def attention_3d_block(inputs,time_steps,single_attention_vector = False):\n    \"\"\"Re-weight a (batch, time_steps, input_dim) tensor with a learned softmax over the time axis.\n\n    inputs: 3-D Keras tensor whose axis 1 has length time_steps.\n    time_steps: sequence length, used as the width of the attention Dense layer.\n    single_attention_vector: if True, share one attention vector across all input dimensions.\n    Returns inputs multiplied element-wise by the attention weights (same shape as inputs).\n    \"\"\"\n    # inputs.shape = (batch_size, time_steps, input_dim)\n    input_dim = int(inputs.shape[2])\n    a = Permute((2, 1),name='Attent_Permute')(inputs) # -> (batch_size, input_dim, time_steps)\n    a = Reshape((input_dim, time_steps),name='Reshape')(a) # No-op, kept only to document which dimension is what\n    a = Dense(time_steps, activation='softmax', name='Attent_Dense')(a) # Create attention vector\n    if single_attention_vector:\n        # Average the attention weights over all input dimensions, then repeat them for every dimension\n        a = Lambda(lambda x: K.mean(x, axis=1), name='Dim_reduction')(a)\n        a = RepeatVector(input_dim, name='Repeat')(a)\n    a_probs = Permute((2, 1), name='Attention_vec')(a) # Swap time steps, input dim axis back\n    output_attention_mul = Multiply(name='Attention_mul')([inputs, a_probs]) # Multiply input with attention vector\n    return output_attention_mul", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "dd94fb1fffd143be6c161462d5620c219eaf420c", "_cell_guid": "ab51696e-1174-4b38-8f9c-706b247a6fa9", "trusted": false }, "cell_type": "code", "source": "", "execution_count": null, "outputs": [] },
{ "metadata": { "collapsed": true, "_uuid": "848fac340b791c6be5acaa6bddc2573e58ffef5c", "_cell_guid": "dba7e9d1-bb4f-4b40-8ea9-bcebb9a62eb3", "trusted": false }, "cell_type": "code", "source": "input_tokens = Input(shape=(maxlen,),name='input')\n\nembedding = Embedding(max_words,\n                      embedding_dim,\n                      input_length=maxlen,\n                      weights = [embedding_matrix],\n                      trainable = False, name='embedding')(input_tokens)\n\nattention_mul = attention_3d_block(inputs = embedding,\n                                   time_steps = maxlen,\n                                   single_attention_vector = True)\n\nlstm_out = CuDNNLSTM(32, return_sequences=True, name='lstm')(attention_mul)\n\n# Feed the LSTM output to the classifier; flattening attention_mul here left the LSTM outside the model graph\nlstm_flat = Flatten(name='flatten')(lstm_out)\noutput = Dense(1, activation='sigmoid',name='output')(lstm_flat)\nmodel = Model(input_tokens, output)\n", "execution_count": null, "outputs": [] },
{ "metadata": { "_uuid": "5658dc91a1ad3249f67606ef0a9a833474291c9b", "_cell_guid": "c2ab7a54-d667-473a-aea6-38aa5e2aded6", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "model.summary()", "execution_count": null, "outputs": [] },
{ "metadata": { "_uuid": 
"308d4cf031a26c03afb32121a7ee2e2802e7e3be", "_cell_guid": "b998ef0b-92e2-4836-a7a8-c0c39195ce73", "trusted": false, "collapsed": true }, "cell_type": "code", "source": "model.compile(optimizer='adam',\n loss='binary_crossentropy',\n metrics=['acc'])\nhistory = model.fit(X_train, y_train,\n epochs=10,\n batch_size=32,\n validation_data=(X_test, y_test))", "execution_count": null, "outputs": [] }, { "metadata": { "_uuid": "80beef42ce6897ce3d95a847f8e453fca73dddbc", "_cell_guid": "2fa46bb2-23bc-4576-9986-539d91f7258b" }, "cell_type": "markdown", "source": "# Similarity" }, { "metadata": { "collapsed": true, "_uuid": "3deb06532737e9102da392b064f9094812b3e4fb", "_cell_guid": "7f40e8a4-6666-4566-829d-d3e97a7f68e5", "trusted": false }, "cell_type": "code", "source": "nlp = spacy.load('en')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "8fd3cfcbd3b8f56bd5b5ecb75c9beb9c62a05e72", "_cell_guid": "3565bf5d-952c-4d65-b606-02bb197378e9", "trusted": false }, "cell_type": "code", "source": "sup1 = nlp('I would like to open a new checking account')\nsup2 = nlp('How do I open a checking account?')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "22e202af2090b6d6e4409b332aa5bd3b19f41ec8", "_cell_guid": "710b92bd-6f00-4d35-b838-b86b44ad9c07", "trusted": false }, "cell_type": "code", "source": "sup1.similarity(sup2)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "4be12193ef0e61d0b8539c360f8998fb1029a4f2", "_cell_guid": "fa782be2-d9c0-4218-b5cb-6358639e38d1", "trusted": false }, "cell_type": "code", "source": "sup3 = nlp('I want to close my checking account')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "d1152d966d1274bd4ed4a065d251f3bb9ffc0f94", "_cell_guid": "79b3e396-1372-4898-ac2d-b4337bfa9d8b", "trusted": false }, "cell_type": "code", "source": "sup1.similarity(sup3)", "execution_count": null, "outputs": [] }, { 
"metadata": { "collapsed": true, "_uuid": "463f43b2b11180982e1a233c7452c362beccf0ec", "_cell_guid": "986e5d2e-a862-4822-970a-2026e86c72c8", "trusted": false }, "cell_type": "code", "source": "sup4 = nlp('I like checking the news')", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "51404d196b7f4ae555a59fd264e5025798184071", "_cell_guid": "bd596f74-7074-4401-bf30-09ccd7e0429e", "trusted": false }, "cell_type": "code", "source": "sup1.similarity(sup4)", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "c6aee51bff6bb4c6f9f79d8f658ac82073487583", "_cell_guid": "dbff75ea-c178-47ad-8afc-d0722c2b5053", "trusted": false }, "cell_type": "code", "source": "import sense2vec", "execution_count": null, "outputs": [] }, { "metadata": { "collapsed": true, "_uuid": "2955e7ccd3ccc2ada4efc8d8cb9670d74a368c70", "_cell_guid": "5943ff8c-4e88-48bd-981b-5599b830daaf", "trusted": false }, "cell_type": "code", "source": "def attention_3d_block(inputs,maxlen,single_attention_vector = False):\n # inputs.shape = (batch_size, time_steps, input_dim)\n input_dim = int(inputs.shape[2])\n time_steps = int(inputs.shape[1])\n #print(input_dim,time_steps)\n a = Permute((2, 1))(inputs) # Swap axis 1 & 2\n a = Reshape((input_dim, maxlen))(a) # this line is not useful. 
It's just to know which dimension is what.\n a = Dense(maxlen, activation='softmax')(a) # Create dense layer to apply to input\n if single_attention_vector:\n a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)\n a = RepeatVector(input_dim)(a)\n a_probs = Permute((2, 1), name='attention_vec')(a)\n output_attention_mul = Multiply(name='attention_mul')([inputs, a_probs])\n return output_attention_mul", "execution_count": null, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.6.4", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" } }, "nbformat": 4, "nbformat_minor": 1 }