{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-06-sarcasm-tf.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/DAF0AC92369C4F74A4AAA2AE089DFDB2.ipynb","timestamp":1644441239012}],"collapsed_sections":[],"toc_visible":true,"mount_file_id":"1JNj0t5pB0ZrfzkQziWwNsgsSNvdlk5q2","authorship_tag":"ABX9TyOZveElLkATPIa8VbAr4Z4n"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","source":["# Sarcasm Detection in Tensorflow"],"metadata":{"id":"HBF2okXCVjmw"}},{"cell_type":"code","metadata":{"id":"kuk7dT6r_vWc"},"source":["path = '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2'"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"q3o9tg-NAZSU"},"source":["!unzip '/content/drive/My Drive/Playground/DAF0AC92369C4F74A4AAA2AE089DFDB2/Sarcasm_Headlines_Dataset_v2.zip'"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"N47CVoQTAgdg"},"source":["import json\n","import math\n","import os\n","import re\n","import warnings\n","\n","import numpy as np\n","import pandas as pd\n","from tqdm import tqdm\n","import seaborn as sns\n","import matplotlib.pyplot as plt\n","\n","import nltk\n","nltk.download('stopwords')\n","nltk.download('wordnet')\n","from nltk.corpus import stopwords\n","from nltk.stem.porter import PorterStemmer \n","from nltk.stem.snowball import SnowballStemmer\n","from nltk.stem.wordnet import WordNetLemmatizer\n","\n","from gensim.models import KeyedVectors\n","\n","warnings.filterwarnings(\"ignore\")\n","\n","tqdm.pandas()\n","%reload_ext autoreload\n","%autoreload 2\n","%reload_ext google.colab.data_table\n","%config InlineBackend.figure_format = 'retina'\n","\n","plt.style.use('fivethirtyeight')\n","plt.style.use('seaborn-notebook')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"bcCadMJZYp24"},"source":["import tensorflow as tf\n","import tensorflow.keras as keras \n","from keras.models import 
Sequential, Model \n","from keras import layers\n","from keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"MHXMcep9BN8D","colab":{"base_uri":"https://localhost:8080/","height":274},"executionInfo":{"status":"ok","timestamp":1596285517595,"user_tz":-330,"elapsed":1036,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"52791d7c-e721-4e69-b3e2-00b5ccb580d0"},"source":["def parse_data(file):\n","    \"\"\"Yield one decoded JSON object per non-blank line of the JSON-Lines file `file`.\"\"\"\n","    # The with-block closes the handle as soon as iteration ends instead of\n","    # leaving that to garbage collection.\n","    with open(file, 'r', encoding='utf-8') as handle:\n","        for line in handle:\n","            if line.strip():  # a blank or trailing empty line is not a record\n","                yield json.loads(line)\n","\n","data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))\n","df = pd.DataFrame(data)\n","df.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.module+javascript":"\n import \"https://ssl.gstatic.com/colaboratory/data_table/a6224c040fa35dcf/data_table.js\";\n\n window.createDataTable({\n data: [[{\n 'v': 0,\n 'f': \"0\",\n },\n{\n 'v': 1,\n 'f': \"1\",\n },\n\"thirtysomething scientists unveil doomsday clock of hair loss\",\n\"https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205\"],\n [{\n 'v': 1,\n 'f': \"1\",\n },\n{\n 'v': 0,\n 'f': \"0\",\n },\n\"dem rep. 
totally nails why congress is falling short on gender, racial equality\",\n\"https://www.huffingtonpost.com/entry/donna-edwards-inequality_us_57455f7fe4b055bb1170b207\"],\n [{\n 'v': 2,\n 'f': \"2\",\n },\n{\n 'v': 0,\n 'f': \"0\",\n },\n\"eat your veggies: 9 deliciously different recipes\",\n\"https://www.huffingtonpost.com/entry/eat-your-veggies-9-delici_b_8899742.html\"],\n [{\n 'v': 3,\n 'f': \"3\",\n },\n{\n 'v': 1,\n 'f': \"1\",\n },\n\"inclement weather prevents liar from getting to work\",\n\"https://local.theonion.com/inclement-weather-prevents-liar-from-getting-to-work-1819576031\"],\n [{\n 'v': 4,\n 'f': \"4\",\n },\n{\n 'v': 1,\n 'f': \"1\",\n },\n\"mother comes pretty close to using word 'streaming' correctly\",\n\"https://www.theonion.com/mother-comes-pretty-close-to-using-word-streaming-cor-1819575546\"]],\n columns: [[\"number\", \"index\"], [\"number\", \"is_sarcastic\"], [\"string\", \"headline\"], [\"string\", \"article_link\"]],\n columnOptions: [{\"width\": \"1px\", \"className\": \"index_column\"}],\n rowsPerPage: 25,\n helpUrl: \"https://colab.research.google.com/notebooks/data_table.ipynb\",\n suppressOutputScrolling: true,\n minimumWidth: undefined,\n });\n ","text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
is_sarcasticheadlinearticle_link
01thirtysomething scientists unveil doomsday clo...https://www.theonion.com/thirtysomething-scien...
10dem rep. totally nails why congress is falling...https://www.huffingtonpost.com/entry/donna-edw...
20eat your veggies: 9 deliciously different recipeshttps://www.huffingtonpost.com/entry/eat-your-...
31inclement weather prevents liar from getting t...https://local.theonion.com/inclement-weather-p...
41mother comes pretty close to using word 'strea...https://www.theonion.com/mother-comes-pretty-c...
\n","
"],"text/plain":[" is_sarcastic ... article_link\n","0 1 ... https://www.theonion.com/thirtysomething-scien...\n","1 0 ... https://www.huffingtonpost.com/entry/donna-edw...\n","2 0 ... https://www.huffingtonpost.com/entry/eat-your-...\n","3 1 ... https://local.theonion.com/inclement-weather-p...\n","4 1 ... https://www.theonion.com/mother-comes-pretty-c...\n","\n","[5 rows x 3 columns]"]},"metadata":{"tags":[]},"execution_count":13}]},
{"cell_type":"code","metadata":{"id":"ZDRdFV7zX2CX","colab":{"base_uri":"https://localhost:8080/","height":68},"executionInfo":{"status":"ok","timestamp":1596291258441,"user_tz":-330,"elapsed":621,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"524468fa-73ca-42a1-a8b1-7a98bbc6dd6c"},"source":["df.is_sarcastic.value_counts()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0 14985\n","1 13634\n","Name: is_sarcastic, dtype: int64"]},"metadata":{"tags":[]},"execution_count":29}]},
{"cell_type":"code","metadata":{"id":"t2hsa7nsBZ96"},"source":["def clean_text(corpus):\n","    \"\"\"Lower-case each text and turn every non-alphanumeric character into a space.\n","\n","    Returns a pandas Series with one cleaned string per input text, in input\n","    order and indexed 0..n-1.\n","    \"\"\"\n","    # One list built in a single pass replaces the per-row Series.append, which\n","    # was quadratic and has been removed from pandas 2.x.\n","    cleaned = [\n","        ' '.join(re.sub('[^a-zA-Z0-9]', ' ', word).lower() for word in row.split())\n","        for row in corpus\n","    ]\n","    return pd.Series(cleaned, dtype=object)\n","\n","\n","def stopwords_removal(corpus):\n","    \"\"\"Split each text on whitespace and drop NLTK English stop words.\n","\n","    Returns a list of token lists.\n","    \"\"\"\n","    stop = set(stopwords.words('english'))\n","    return [[token for token in text.split() if token not in stop] for text in corpus]\n","\n","\n","def lemmatize(corpus):\n","    \"\"\"Lemmatize every token as a verb (pos='v') with the WordNet lemmatizer.\"\"\"\n","    lem = WordNetLemmatizer()\n","    return [[lem.lemmatize(token, pos='v') for token in tokens] for tokens in corpus]\n","\n","\n","def stem(corpus, stem_type = None):\n","    \"\"\"Stem every token: Snowball when stem_type == 'snowball', Porter otherwise.\"\"\"\n","    if stem_type == 'snowball':\n","        stemmer = SnowballStemmer(language='english')\n","    else:\n","        stemmer = PorterStemmer()\n","    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]\n","\n","\n","def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n","    \"\"\"Run the selected text-normalisation steps and return one string per text.\n","\n","    Steps, in order: optional character cleaning, whitespace tokenisation (with\n","    or without stop-word removal), optional lemmatization, optional stemming.\n","    The surviving tokens of each text are re-joined with single spaces.\n","    \"\"\"\n","    if cleaning:\n","        corpus = clean_text(corpus)\n","    if remove_stopwords:\n","        corpus = stopwords_removal(corpus)\n","    else:\n","        corpus = [text.split() for text in corpus]\n","    if lemmatization:\n","        corpus = lemmatize(corpus)\n","    if stemming:\n","        corpus = stem(corpus, stem_type)\n","    return [' '.join(tokens) for tokens in corpus]"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"sVdI4HPlMQ05"},"source":["headlines = preprocess(df['headline'], lemmatization = True, remove_stopwords = True)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"ykAnD74LNPil"},"source":["# !wget -c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"\n","# Bound to word_vectors rather than model: a later cell assigns the Keras network\n","# to `model`, which would otherwise shadow the embeddings on any re-run.\n","word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"KJqTd_9NUnZU"},"source":["MAX_LENGTH = 10\n","VECTOR_SIZE = 300\n","\n","def vectorize_data(data, embeddings=None):\n","    \"\"\"Turn each text into exactly MAX_LENGTH word vectors of size VECTOR_SIZE.\n","\n","    Tokens missing from the embedding vocabulary are skipped, at most MAX_LENGTH\n","    known tokens are kept, and shorter texts are right-padded with zero vectors.\n","    `embeddings` defaults to the notebook-level `word_vectors`; any object that\n","    supports `token in embeddings` and `embeddings[token]` can be passed instead.\n","    Returns a list with one list of MAX_LENGTH vectors per input text.\n","    \"\"\"\n","    if embeddings is None:\n","        embeddings = word_vectors\n","    padding_vector = [0.0] * VECTOR_SIZE\n","    vectors = []\n","    for data_point in data:\n","        data_point_vectors = []\n","        for token in data_point.split():\n","            if len(data_point_vectors) >= MAX_LENGTH:\n","                break\n","            # Membership test and item access work on every gensim release;\n","            # the old `.wv.vocab` chain no longer exists in gensim 4.\n","            if token in embeddings:\n","                data_point_vectors.append(embeddings[token])\n","        data_point_vectors.extend([padding_vector] * (MAX_LENGTH - len(data_point_vectors)))\n","        vectors.append(data_point_vectors)\n","    return vectors"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"fQtDJFOEXXSz"},"source":["vectorized_headlines = 
vectorize_data(headlines)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"FpZHrheZXscw","colab":{"base_uri":"https://localhost:8080/","height":85},"executionInfo":{"status":"ok","timestamp":1596291394447,"user_tz":-330,"elapsed":2363,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"76c315c5-2580-4234-bfca-4891dad600e3"},"source":["# Ordered (unshuffled) 70/30 split: the first 70 % of rows train, the rest test.\n","train_div = math.floor(0.7 * len(vectorized_headlines))\n","\n","X_train = vectorized_headlines[:train_div]\n","y_train = df['is_sarcastic'][:train_div]\n","X_test = vectorized_headlines[train_div:]\n","y_test = df['is_sarcastic'][train_div:]\n","\n","print('The size of X_train is:', len(X_train), '\\nThe size of y_train is:', len(y_train),\n","      '\\nThe size of X_test is:', len(X_test), '\\nThe size of y_test is:', len(y_test))\n","\n","# Build the arrays as float32 (the Keras default float type) so the nested\n","# lists are not first materialised as a float64 array of twice the size.\n","X_train = np.asarray(X_train, dtype=np.float32).reshape(len(X_train), MAX_LENGTH, VECTOR_SIZE)\n","X_test = np.asarray(X_test, dtype=np.float32).reshape(len(X_test), MAX_LENGTH, VECTOR_SIZE)\n","y_train = np.array(y_train)\n","y_test = np.array(y_test)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["The size of X_train is: 20033 \n","The size of y_train is: 20033 \n","The size of X_test is: 8586 \n","The size of y_test is: 8586\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"wOkQbMnNYWQb"},"source":["FILTERS=8\n","KERNEL_SIZE=3\n","HIDDEN_LAYER_1_NODES=10\n","HIDDEN_LAYER_2_NODES=5\n","DROPOUT_PROB=0.35\n","NUM_EPOCHS=10\n","BATCH_SIZE=50"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"sx7xixBXYc27","colab":{"base_uri":"https://localhost:8080/","height":408},"executionInfo":{"status":"ok","timestamp":1596291498917,"user_tz":-330,"elapsed":1332,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"51e19883-12c6-4fd2-d3d7-bd36d5ec7546"},"source":["# 1-D CNN over the (MAX_LENGTH, VECTOR_SIZE) embedding sequence, followed by two\n","# small dense layers with dropout and a sigmoid unit for the binary label.\n","model = Sequential()\n","\n","model.add(Conv1D(FILTERS,\n","                 KERNEL_SIZE,\n","                 padding='same',\n","                 strides=1,\n","                 activation='relu',\n","                 input_shape=(MAX_LENGTH, VECTOR_SIZE)))\n","model.add(GlobalMaxPooling1D())\n","model.add(Dense(HIDDEN_LAYER_1_NODES, activation='relu'))\n","model.add(Dropout(DROPOUT_PROB))\n","model.add(Dense(HIDDEN_LAYER_2_NODES, activation='relu'))\n","model.add(Dropout(DROPOUT_PROB))\n","model.add(Dense(1, activation='sigmoid'))\n","# summary() prints the table itself and returns None, so wrapping it in print()\n","# only appended a stray None line to the output.\n","model.summary()"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Model: \"sequential_1\"\n","_________________________________________________________________\n","Layer (type) Output Shape Param # \n","=================================================================\n","conv1d_1 (Conv1D) (None, 10, 8) 7208 \n","_________________________________________________________________\n","global_max_pooling1d_1 (Glob (None, 8) 0 \n","_________________________________________________________________\n","dense_1 (Dense) (None, 10) 90 \n","_________________________________________________________________\n","dropout_1 (Dropout) (None, 10) 0 \n","_________________________________________________________________\n","dense_2 (Dense) (None, 5) 55 \n","_________________________________________________________________\n","dropout_2 (Dropout) (None, 5) 0 \n","_________________________________________________________________\n","dense_3 (Dense) (None, 1) 6 \n","=================================================================\n","Total params: 7,359\n","Trainable params: 7,359\n","Non-trainable params: 0\n","_________________________________________________________________\n","None\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"uUP9uBLZYk5x"},"source":["model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"cRkhMFnGY2hg","colab":{"base_uri":"https://localhost:8080/","height":357},"executionInfo":{"status":"ok","timestamp":1596291543542,"user_tz":-330,"elapsed":25614,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"e8e28a44-6946-4982-a852-3536e5e6b30d"},"source":["# Fit on the training split; the returned History object is kept in training_history.\n","training_history = model.fit(\n","    X_train,\n","    y_train,\n","    batch_size=BATCH_SIZE,\n","    epochs=NUM_EPOCHS,\n",")"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Epoch 1/10\n","20033/20033 [==============================] - 3s 143us/step - loss: 0.6554 - accuracy: 0.5961\n","Epoch 2/10\n","20033/20033 [==============================] - 2s 118us/step - loss: 0.5766 - accuracy: 0.6954\n","Epoch 3/10\n","20033/20033 [==============================] - 2s 118us/step - loss: 0.5371 - accuracy: 0.7318\n","Epoch 4/10\n","20033/20033 [==============================] - 2s 117us/step - loss: 0.5071 - accuracy: 0.7501\n","Epoch 5/10\n","20033/20033 [==============================] - 2s 118us/step - loss: 0.4790 - accuracy: 0.7658\n","Epoch 6/10\n","20033/20033 [==============================] - 2s 118us/step - loss: 0.4640 - accuracy: 0.7804\n","Epoch 7/10\n","20033/20033 [==============================] - 2s 119us/step - loss: 0.4421 - accuracy: 0.7919\n","Epoch 8/10\n","20033/20033 [==============================] - 2s 114us/step - loss: 0.4265 - accuracy: 0.8007\n","Epoch 9/10\n","20033/20033 [==============================] - 2s 115us/step - loss: 0.4081 - accuracy: 0.8050\n","Epoch 10/10\n","20033/20033 [==============================] - 2s 115us/step - loss: 0.3939 - accuracy: 0.8148\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"VYOJMjxrY41d","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1596291557432,"user_tz":-330,"elapsed":1539,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"outputId":"485bbc10-f40e-4e1d-93b6-ca47e8680e4d"},"source":["# The model is compiled with metrics=['accuracy'], so evaluate() yields the loss\n","# followed by the accuracy on the held-out split.\n","loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=False)\n","print(\"Testing Accuracy: {:.4f}\".format(accuracy))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Testing Accuracy: 0.7600\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"VexECTPzZCGW"},"source":["# Architecture is written as JSON and the learned weights as HDF5, in two files.\n","model_structure = model.to_json()\n","with open(\"sarcasm_detection_model_cnn.json\", mode=\"w\") as json_file:\n","    json_file.write(model_structure)\n","model.save_weights(filepath=\"sarcasm_detection_model_cnn.h5\")"],"execution_count":null,"outputs":[]}]}