{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the labelled humor dataset\n",
    "\n",
    "This notebook reads the following inputs:\n",
    "\n",
    "- Our processed tweets, i.e. the output of `02_Append_All_Tweets_together.ipynb`.\n",
    "- A readily available humor corpus (positive as well as negative class): one-liners, proverbs, Wikipedia sentences and Reuters news headlines, found at https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection\n",
    "\n",
    "It creates a final dataframe with `(sentence, class)` columns that we will use later for modelling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries (standard library first, then third-party)\n",
    "import random\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Our tweets: only the 'tweet_text' column is kept.\n",
    "tweets = pd.read_csv('datasets/Tweets_combined_20190714.csv')['tweet_text']\n",
    "\n",
    "# Additional humor corpus found on\n",
    "# https://github.com/CrowdTruth/Short-Text-Corpus-For-Humor-Detection\n",
    "# NOTE(review): unpickling can execute arbitrary code, so these files must come\n",
    "# from a trusted copy of that repository - confirm before re-downloading them.\n",
    "oneliners = pd.read_pickle('datasets/humorous_oneliners.pickle')\n",
    "proverbs = pd.read_pickle('datasets/proverbs.pickle')\n",
    "wiki = pd.read_pickle('datasets/wiki_sentences.pickle')\n",
    "reuters = pd.read_pickle('datasets/reuters_headlines.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Class labels used in the final dataframe\n",
    "HUMOROUS = 1\n",
    "NOT_HUMOROUS = 0\n",
    "\n",
    "\n",
    "def label_sentences(sentences, label):\n",
    "    '''Return a list of (sentence, label) tuples, one per item of `sentences`.'''\n",
    "    return [(sentence, label) for sentence in sentences]\n",
    "\n",
    "\n",
    "# Positive class: 1) our tweets and 2) the one-liners from the humor corpus\n",
    "oneliner_record = label_sentences(oneliners, HUMOROUS)\n",
    "tweet_record = label_sentences(tweets, HUMOROUS)\n",
    "\n",
    "# Negative class: proverbs, Wikipedia sentences and Reuters headlines\n",
    "proverb_record = label_sentences(proverbs, NOT_HUMOROUS)\n",
    "wiki_record = label_sentences(wiki, NOT_HUMOROUS)\n",
    "reuter_record = label_sentences(reuters, NOT_HUMOROUS)\n",
    "\n",
    "# Combined records per class; the next cell balances and merges them\n",
    "positive_record = oneliner_record + tweet_record\n",
    "negative_record = wiki_record + proverb_record + reuter_record\n",
    "\n",
    "# Column names of the final dataframe\n",
    "columns = ['sentence', 'class']"
   ]
},
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed (arbitrary value) for every random step in this cell, so that\n",
    "# Restart & Run All always produces the same Final_data.csv.\n",
    "SEED = 42\n",
    "\n",
    "# We expect more negative class records than positive ones. Draw a random subset\n",
    "# with as many records as the positive class so we can model a balanced dataset.\n",
    "# min() keeps this working (by taking every negative) if that expectation fails.\n",
    "n_negative = min(len(positive_record), len(negative_record))\n",
    "\n",
    "# sample() returns a new list and leaves negative_record untouched, so this cell\n",
    "# can be re-run safely without losing the full pool of negatives.\n",
    "negative_sample = random.Random(SEED).sample(negative_record, n_negative)\n",
    "\n",
    "# Create dataframe for modelling\n",
    "df_record = positive_record + negative_sample\n",
    "df = pd.DataFrame(df_record, columns=columns)\n",
    "\n",
    "# Randomly shuffle the rows (seeded) and reset the index\n",
    "df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)\n",
    "\n",
    "# Sanity check: no record was lost or duplicated while building the dataframe\n",
    "assert len(df) == len(positive_record) + n_negative, 'unexpected number of rows'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save this. It will be used for modelling in the next script\n",
    "df.to_csv('datasets/Final_data.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}