{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyOdNY+9wuYsjywlW4gTb+x4", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "https://archive.ics.uci.edu/dataset/331/sentiment+labelled+sentences\n" ], "metadata": { "id": "sp2b6deAWEAi" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "\n", "filepath_dict = {'yelp': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/yelp_labelled.txt',\n", " 'amazon': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/amazon_cells_labelled.txt',\n", " 'imdb': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/imdb_labelled.txt'}\n", "\n", "df_list = []\n", "for source, filepath in filepath_dict.items():\n", " df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\\t')\n", " df['source'] = source # Add another column filled with the source name\n", " df_list.append(df)\n", "\n", "df = pd.concat(df_list)\n", "print(df.iloc[0])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oq-C9PwYWDom", "outputId": "7e32fe27-6892-43e6-8aef-e071d65aef22" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "sentence Wow... 
Loved this place.\n", "label 1\n", "source yelp\n", "Name: 0, dtype: object\n" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Qd2nhwJZV9-F" }, "outputs": [], "source": [ "sentences = ['John likes ice cream', 'John hates chocolate.']" ] }, { "cell_type": "code", "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = CountVectorizer(min_df=0, lowercase=False)\n", "vectorizer.fit(sentences)\n", "vectorizer.vocabulary_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6H_HQ7dAWDZu", "outputId": "06b85cb8-ce3f-4060-9d09-40d75eabfeb0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "OAYTiaXdWDW2" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "vectorizer.transform(sentences).toarray()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MREny8T_WDT-", "outputId": "7756d8d8-6313-4c20-bb29-41e800bc0edd" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[1, 0, 1, 0, 1, 1],\n", " [1, 1, 0, 1, 0, 0]])" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "df_yelp = df[df['source'] == 'yelp']\n", "\n", "sentences = df_yelp['sentence'].values\n", "y = df_yelp['label'].values\n", "\n", "sentences_train, sentences_test, y_train, y_test = train_test_split(\n", " sentences, y, test_size=0.25, random_state=1000)" ], "metadata": { "id": "QgR7b87UWDOT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = CountVectorizer()\n", 
"vectorizer.fit(sentences_train)\n", "\n", "X_train = vectorizer.transform(sentences_train)\n", "X_test = vectorizer.transform(sentences_test)\n", "X_train" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xL3fcoLaXb0K", "outputId": "15ada9b5-4983-4a7f-8dea-7efe8ac03da0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<750x1714 sparse matrix of type ''\n", "\twith 7368 stored elements in Compressed Sparse Row format>" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import LogisticRegression\n", "\n", "classifier = LogisticRegression()\n", "classifier.fit(X_train, y_train)\n", "score = classifier.score(X_test, y_test)\n", "\n", "print(\"Accuracy:\", score)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f_X_B-NAXbxP", "outputId": "25c31651-9ad4-41b7-c35b-656273f81e60" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Accuracy: 0.796\n" ] } ] }, { "cell_type": "code", "source": [ "for source in df['source'].unique():\n", " df_source = df[df['source'] == source]\n", " sentences = df_source['sentence'].values\n", " y = df_source['label'].values\n", "\n", " sentences_train, sentences_test, y_train, y_test = train_test_split(\n", " sentences, y, test_size=0.25, random_state=1000)\n", "\n", " vectorizer = CountVectorizer()\n", " vectorizer.fit(sentences_train)\n", " X_train = vectorizer.transform(sentences_train)\n", " X_test = vectorizer.transform(sentences_test)\n", "\n", " classifier = LogisticRegression()\n", " classifier.fit(X_train, y_train)\n", " score = classifier.score(X_test, y_test)\n", " print('Accuracy for {} data: {:.4f}'.format(source, score))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9n_E8ZTtXbup", "outputId": "30db96fa-2bec-4456-c642-cfb3f2118cd6" }, "execution_count": null, "outputs": [ { 
"output_type": "stream", "name": "stdout", "text": [ "Accuracy for yelp data: 0.7960\n", "Accuracy for amazon data: 0.7960\n", "Accuracy for imdb data: 0.7487\n" ] } ] }, { "cell_type": "code", "source": [ "from keras.models import Sequential\n", "from keras import layers\n", "\n", "input_dim = X_train.shape[1] # Number of features\n", "\n", "model = Sequential()\n", "model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))" ], "metadata": { "id": "IMBW44G8Xbrt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model.compile(loss='binary_crossentropy',\n", " optimizer='adam',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kx5mZB4eXbpB", "outputId": "6796a2e5-fde6-4dfc-d180-5ffbda63222d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " dense (Dense) (None, 10) 25060 \n", " \n", " dense_1 (Dense) (None, 1) 11 \n", " \n", "=================================================================\n", "Total params: 25071 (97.93 KB)\n", "Trainable params: 25071 (97.93 KB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "history = model.fit(X_train, y_train,\n", " epochs=100,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)" ], "metadata": { "id": "iVVg-k5EXbmC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from keras.backend import clear_session\n", "clear_session()" ], "metadata": { "id": "BwSAo1IuXbje" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", 
"source": [ "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3vkG7Dj8Xbgk", "outputId": "625575b7-194f-4c64-b186-b200f9523c16" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 1.0000\n", "Testing Accuracy: 0.7754\n" ] } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "plt.style.use('ggplot')\n", "\n", "def plot_history(history):\n", " acc = history.history['accuracy']\n", " val_acc = history.history['val_accuracy']\n", " loss = history.history['loss']\n", " val_loss = history.history['val_loss']\n", " x = range(1, len(acc) + 1)\n", "\n", " plt.figure(figsize=(12, 5))\n", " plt.subplot(1, 2, 1)\n", " plt.plot(x, acc, 'b', label='Training acc')\n", " plt.plot(x, val_acc, 'r', label='Validation acc')\n", " plt.title('Training and validation accuracy')\n", " plt.legend()\n", " plt.subplot(1, 2, 2)\n", " plt.plot(x, loss, 'b', label='Training loss')\n", " plt.plot(x, val_loss, 'r', label='Validation loss')\n", " plt.title('Training and validation loss')\n", " plt.legend()" ], "metadata": { "id": "YYDDBMfVXbdi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "plot_history(history)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 469 }, "id": "dbJDJBA9Xbat", "outputId": "0f3ba545-2b4a-416f-a6f3-94ff5a117892" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] },
{ "cell_type": "code", "source": [
"cities = ['London', 'Berlin', 'Berlin', 'New York', 'London']\n",
"cities" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5hncoX9IXbVi", "outputId": "0ea9ed9d-6703-4706-f1f9-0438b38f2576" }, "execution_count": null,
"outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['London', 'Berlin', 'Berlin', 'New York', 'London']" ] }, "metadata": {}, "execution_count": 23 } ] },
{ "cell_type": "code", "source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"encoder = LabelEncoder()\n",
"city_labels = encoder.fit_transform(cities)\n",
"city_labels" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YRTyHrXWYd23", "outputId": "9ad6dd14-0260-40f4-e988-abc5a01113d5" }, "execution_count": null,
"outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([1, 0, 0, 2, 1])" ] }, "metadata": {}, "execution_count": 24 } ] },
{ "cell_type": "code", "source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and is removed in 1.4\n",
"encoder = OneHotEncoder(sparse_output=False)\n",
"# -1 lets NumPy infer the number of rows instead of hard-coding 5 samples\n",
"city_labels = city_labels.reshape(-1, 1)\n",
"encoder.fit_transform(city_labels)" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ARmpK86eYdzv", "outputId": "08bddaf2-9470-4d7e-e8eb-7053b7ee2dda" }, "execution_count": null,
"outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0., 1., 0.],\n", " [1., 0., 0.],\n", " [1., 0., 0.],\n", " [0., 0., 1.],\n", " [0., 1., 0.]])" ] }, "metadata": {}, "execution_count": 25 } ] },
{ "cell_type": "code", "source": [
"from keras.preprocessing.text import Tokenizer\n",
"\n",
"tokenizer = Tokenizer(num_words=5000)\n",
"tokenizer.fit_on_texts(sentences_train)\n",
"\n",
"X_train = tokenizer.texts_to_sequences(sentences_train)\n",
"X_test = tokenizer.texts_to_sequences(sentences_test)\n",
"\n",
"vocab_size = len(tokenizer.word_index) + 1 # Adding 1 because of reserved 0 index\n",
"\n",
"print(sentences_train[2])\n",
"print(X_train[2])" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nU9LoxCPYdxM", "outputId": "88cf7843-ba17-4b57-eca2-5275483ee6a6" }, "execution_count": null,
"outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "I am a fan of his ... This movie sucked really bad. 
\n", "[7, 150, 2, 932, 4, 49, 6, 11, 563, 45, 30]\n" ] } ] },
{ "cell_type": "code", "source": [
"# Look words up with .get(): a word that is missing from the fitted vocabulary\n",
"# prints None instead of aborting the cell with a KeyError.\n",
"for word in ['the', 'all', 'happy', 'sad']:\n",
"    print('{}: {}'.format(word, tokenizer.word_index.get(word)))" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 193 }, "id": "AwfdcLvNYdu4", "outputId": "6073a8fd-ecba-4608-ccce-788feb2d500a" }, "execution_count": null,
"outputs": [] },
{ "cell_type": "code", "source": [
"from keras.preprocessing.sequence import pad_sequences\n",
"\n",
"maxlen = 100\n",
"\n",
"X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)\n",
"X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)\n",
"\n",
"print(X_train[0, 
:])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eLGO0nnVYdsU", "outputId": "1985fce7-a8df-4e6b-f360-2d709ea0c94a" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[170 116 390 35 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", " 0 0 0 0 0 0 0 0 0 0]\n" ] } ] }, { "cell_type": "code", "source": [ "from keras.models import Sequential\n", "from keras import layers\n", "\n", "embedding_dim = 50\n", "\n", "model = Sequential()\n", "model.add(layers.Embedding(input_dim=vocab_size,\n", " output_dim=embedding_dim,\n", " input_length=maxlen))\n", "model.add(layers.Flatten())\n", "model.add(layers.Dense(10, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qU9k7VW6Ydpd", "outputId": "bb50341d-f063-4aed-c590-cc941511b852" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding (Embedding) (None, 100, 50) 128750 \n", " \n", " flatten (Flatten) (None, 5000) 0 \n", " \n", " dense (Dense) (None, 10) 50010 \n", " \n", " dense_1 (Dense) (None, 1) 11 \n", " \n", "=================================================================\n", "Total params: 178771 (698.32 KB)\n", "Trainable params: 178771 (698.32 KB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "history = 
model.fit(X_train, y_train,\n", " epochs=20,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)\n", "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))\n", "plot_history(history)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 504 }, "id": "8xe4JifNYdmV", "outputId": "d4ae1f81-0277-4cff-e6bb-9f71524aeb9b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 1.0000\n", "Testing Accuracy: 0.6578\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "from keras.models import Sequential\n", "from keras import layers\n", "\n", "embedding_dim = 50\n", "\n", "model = Sequential()\n", "model.add(layers.Embedding(input_dim=vocab_size,\n", " output_dim=embedding_dim,\n", " input_length=maxlen))\n", "model.add(layers.GlobalMaxPool1D())\n", "model.add(layers.Dense(10, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W7EahZb_Ydjh", "outputId": "b9420673-bcf6-438b-9927-3f8b084cf510" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential_1\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " embedding_1 (Embedding) (None, 100, 50) 128750 \n", " \n", " global_max_pooling1d (Glob (None, 50) 0 \n", " alMaxPooling1D) \n", " \n", " dense_2 (Dense) (None, 10) 510 \n", " \n", " dense_3 (Dense) (None, 1) 11 \n", " \n", "=================================================================\n", "Total params: 129271 (504.96 KB)\n", "Trainable params: 129271 (504.96 KB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "history = model.fit(X_train, y_train,\n", " epochs=50,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)\n", "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))\n", "plot_history(history)" ], 
"metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 504 }, "id": "U7Eoz2lRYdg_", "outputId": "d62c5d25-4f88-4919-972b-898f0f3fc405" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 1.0000\n", "Testing Accuracy: 0.7861\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] },
{ "cell_type": "code", "source": [
"import numpy as np\n",
"\n",
"def create_embedding_matrix(filepath, word_index, embedding_dim):\n",
"    \"\"\"Build an embedding matrix for `word_index` from a whitespace-separated vector file.\n",
"\n",
"    Each line of the file is split on whitespace: the first token is the word,\n",
"    the remaining tokens are its vector components.\n",
"\n",
"    Args:\n",
"        filepath: Path to the embedding text file.\n",
"        word_index: Mapping of word -> integer row index.\n",
"        embedding_dim: Number of vector components kept per word.\n",
"\n",
"    Returns:\n",
"        Array of shape (len(word_index) + 1, embedding_dim); rows of words that\n",
"        do not occur in the file stay all-zero.\n",
"    \"\"\"\n",
"    vocab_size = len(word_index) + 1 # Adding again 1 because of reserved 0 index\n",
"    embedding_matrix = np.zeros((vocab_size, embedding_dim))\n",
"\n",
"    # Explicit encoding: the platform default may fail on non-ASCII tokens.\n",
"    with open(filepath, encoding='utf-8') as f:\n",
"        for line in f:\n",
"            parts = line.split()\n",
"            if not parts:\n",
"                # Blank line: nothing to unpack\n",
"                continue\n",
"            word, *vector = parts\n",
"            if word in word_index:\n",
"                idx = word_index[word]\n",
"                embedding_matrix[idx] = np.array(\n",
"                    vector, dtype=np.float32)[:embedding_dim]\n",
"\n",
"    return embedding_matrix" ],
"metadata": { "id": "HYSFF4iAYdeU" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# NOTE: glove.6B.50d.txt is not shipped with this notebook; the file must be\n",
"# placed at the path below before running this cell (otherwise FileNotFoundError).\n",
"embedding_dim = 50\n",
"embedding_matrix = create_embedding_matrix(\n",
"    'data/glove_word_embeddings/glove.6B.50d.txt',\n",
"    tokenizer.word_index, embedding_dim)" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 315 }, "id": "cgReGk80Ydb-", "outputId": "799a564c-766f-4405-ac9c-212e1a535982" }, "execution_count": null,
"outputs": [] },
{ "cell_type": "code", "source": [
"nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))\n",
"nonzero_elements / vocab_size" ],
"metadata": { "id": "bmSmdC7aYdZB" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"model = Sequential()\n",
"model.add(layers.Embedding(vocab_size, embedding_dim,\n",
" weights=[embedding_matrix],\n",
" input_length=maxlen,\n",
" trainable=False))\n",
"model.add(layers.GlobalMaxPool1D())\n",
"model.add(layers.Dense(10, activation='relu'))\n",
"model.add(layers.Dense(1, 
activation='sigmoid'))\n", "model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "id": "ejpqLNpCYdWc" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "history = model.fit(X_train, y_train,\n", " epochs=50,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)\n", "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))\n", "plot_history(history)" ], "metadata": { "id": "aQGqF2S4YdT5" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "MVOMztAQYdRU" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = Sequential()\n", "model.add(layers.Embedding(vocab_size, embedding_dim,\n", " weights=[embedding_matrix],\n", " input_length=maxlen,\n", " trainable=True))\n", "model.add(layers.GlobalMaxPool1D())\n", "model.add(layers.Dense(10, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "id": "d_3uK3_Ab_1D" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "history = model.fit(X_train, y_train,\n", " epochs=50,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)\n", "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))\n", "plot_history(history)" ], "metadata": { "id": "jaXu02YTb_x8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", 
"source": [], "metadata": { "id": "_qDUq0Hyb_vf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "embedding_dim = 100\n", "\n", "model = Sequential()\n", "model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))\n", "model.add(layers.Conv1D(128, 5, activation='relu'))\n", "model.add(layers.GlobalMaxPooling1D())\n", "model.add(layers.Dense(10, activation='relu'))\n", "model.add(layers.Dense(1, activation='sigmoid'))\n", "model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", "model.summary()" ], "metadata": { "id": "1OQt-Kd-b_tT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "history = model.fit(X_train, y_train,\n", " epochs=10,\n", " verbose=False,\n", " validation_data=(X_test, y_test),\n", " batch_size=10)\n", "loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n", "print(\"Training Accuracy: {:.4f}\".format(accuracy))\n", "loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n", "print(\"Testing Accuracy: {:.4f}\".format(accuracy))\n", "plot_history(history)" ], "metadata": { "id": "LRLa7IBSb_qu" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):\n", " model = Sequential()\n", " model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))\n", " model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))\n", " model.add(layers.GlobalMaxPooling1D())\n", " model.add(layers.Dense(10, activation='relu'))\n", " model.add(layers.Dense(1, activation='sigmoid'))\n", " model.compile(optimizer='adam',\n", " loss='binary_crossentropy',\n", " metrics=['accuracy'])\n", " return model" ], "metadata": { "id": "ZwQp7R3Qb_oS" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "param_grid = dict(num_filters=[32, 64, 128],\n", " kernel_size=[3, 5, 7],\n", " 
vocab_size=[5000],\n",
" embedding_dim=[50],\n",
" maxlen=[100])" ],
"metadata": { "id": "_5QzQLf-b_l2" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"import os\n",
"\n",
"# NOTE(review): keras.wrappers.scikit_learn appears to be absent from newer\n",
"# Keras/TensorFlow releases - confirm the installed version still provides it.\n",
"from keras.wrappers.scikit_learn import KerasClassifier\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"\n",
"# Main settings\n",
"epochs = 20\n",
"embedding_dim = 50\n",
"maxlen = 100\n",
"output_file = 'data/output.txt'\n",
"\n",
"# Run grid search for each source (yelp, amazon, imdb)\n",
"for source, frame in df.groupby('source'):\n",
"    print('Running grid search for data set :', source)\n",
"    # Use only this source's rows; indexing `df` here would run every search\n",
"    # on all three data sets combined.\n",
"    sentences = frame['sentence'].values\n",
"    y = frame['label'].values\n",
"\n",
"    # Train-test split\n",
"    sentences_train, sentences_test, y_train, y_test = train_test_split(\n",
"        sentences, y, test_size=0.25, random_state=1000)\n",
"\n",
"    # Tokenize words\n",
"    tokenizer = Tokenizer(num_words=5000)\n",
"    tokenizer.fit_on_texts(sentences_train)\n",
"    X_train = tokenizer.texts_to_sequences(sentences_train)\n",
"    X_test = tokenizer.texts_to_sequences(sentences_test)\n",
"\n",
"    # Adding 1 because of reserved 0 index\n",
"    vocab_size = len(tokenizer.word_index) + 1\n",
"\n",
"    # Pad sequences with zeros\n",
"    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)\n",
"    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)\n",
"\n",
"    # Parameter grid for grid search\n",
"    param_grid = dict(num_filters=[32, 64, 128],\n",
"                      kernel_size=[3, 5, 7],\n",
"                      vocab_size=[vocab_size],\n",
"                      embedding_dim=[embedding_dim],\n",
"                      maxlen=[maxlen])\n",
"    model = KerasClassifier(build_fn=create_model,\n",
"                            epochs=epochs, batch_size=10,\n",
"                            verbose=False)\n",
"    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,\n",
"                              cv=4, verbose=1, n_iter=5)\n",
"    grid_result = grid.fit(X_train, y_train)\n",
"\n",
"    # Evaluate testing set\n",
"    test_accuracy = grid.score(X_test, y_test)\n",
"\n",
"    # Save and evaluate results\n",
"    prompt = input(f'finished {source}; write to file and proceed? [y/n]')\n",
"    if prompt.lower() not in {'y', 'true', 'yes'}:\n",
"        break\n",
"    # Create the output folder first so the append does not fail on a missing directory\n",
"    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)\n",
"    with open(output_file, 'a') as f:\n",
"        s = ('Running {} data set\\nBest Accuracy : '\n",
"             '{:.4f}\\n{}\\nTest Accuracy : {:.4f}\\n\\n')\n",
"        output_string = s.format(\n",
"            source,\n",
"            grid_result.best_score_,\n",
"            grid_result.best_params_,\n",
"            test_accuracy)\n",
"        print(output_string)\n",
"        f.write(output_string)" ],
"metadata": { "id": "t5sqLMvzcLGM" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "Ug1wfWVIcLDo" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "SGmNXp22cLBD" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "xADWnngEcK-x" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "KTSOrnVLcK7l" }, "execution_count": null, "outputs": [] } ] }