{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyOdNY+9wuYsjywlW4gTb+x4",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"source": [
"Dataset: [Sentiment Labelled Sentences](https://archive.ics.uci.edu/dataset/331/sentiment+labelled+sentences) from the UCI Machine Learning Repository.\n"
],
"metadata": {
"id": "sp2b6deAWEAi"
}
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# Tab-separated review files (sentence, label), keyed by the site they come from.\n",
"filepath_dict = {\n",
"    'yelp': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/yelp_labelled.txt',\n",
"    'amazon': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/amazon_cells_labelled.txt',\n",
"    'imdb': 'https://raw.githubusercontent.com/atharvajk98/UCI-Sentiment-Analysis/master/Dataset/imdb_labelled.txt',\n",
"}\n",
"\n",
"# Load each file and tag its rows with the site name before stacking them.\n",
"df_list = []\n",
"for source, filepath in filepath_dict.items():\n",
"    site_df = pd.read_csv(filepath, sep='\\t', names=['sentence', 'label'])\n",
"    df_list.append(site_df.assign(source=source))\n",
"\n",
"# The stacked frame keeps each file's own row index, so index labels repeat across sites.\n",
"df = pd.concat(df_list)\n",
"print(df.iloc[0])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oq-C9PwYWDom",
"outputId": "7e32fe27-6892-43e6-8aef-e071d65aef22"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"sentence Wow... Loved this place.\n",
"label 1\n",
"source yelp\n",
"Name: 0, dtype: object\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Qd2nhwJZV9-F"
},
"outputs": [],
"source": [
"# Two-sentence toy corpus used to illustrate the bag-of-words encoding.\n",
"sentences = [\n",
"    'John likes ice cream',\n",
"    'John hates chocolate.',\n",
"]"
]
},
{
"cell_type": "code",
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Build a case-sensitive bag-of-words vocabulary from the toy sentences.\n",
"# min_df must be an int >= 1 or a float in [0.0, 1.0]; current scikit-learn\n",
"# releases reject the integer 0 with InvalidParameterError. With min_df=1\n",
"# every token that occurs at all is kept.\n",
"vectorizer = CountVectorizer(min_df=1, lowercase=False)\n",
"vectorizer.fit(sentences)\n",
"\n",
"# Mapping of token -> column index in the count matrix.\n",
"vectorizer.vocabulary_"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6H_HQ7dAWDZu",
"outputId": "06b85cb8-ce3f-4060-9d09-40d75eabfeb0"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "OAYTiaXdWDW2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Encode each toy sentence as a row of per-token counts, then densify the\n",
"# sparse result so the full matrix is shown.\n",
"toy_counts = vectorizer.transform(sentences)\n",
"toy_counts.toarray()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MREny8T_WDT-",
"outputId": "7756d8d8-6313-4c20-bb29-41e800bc0edd"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[1, 0, 1, 0, 1, 1],\n",
" [1, 1, 0, 1, 0, 0]])"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Keep only the rows that were loaded from the Yelp file.\n",
"is_yelp = df['source'] == 'yelp'\n",
"df_yelp = df[is_yelp]\n",
"\n",
"# Raw review text is the input, the label column is the target.\n",
"sentences = df_yelp['sentence'].values\n",
"y = df_yelp['label'].values\n",
"\n",
"# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.\n",
"sentences_train, sentences_test, y_train, y_test = train_test_split(\n",
"    sentences,\n",
"    y,\n",
"    test_size=0.25,\n",
"    random_state=1000,\n",
")"
],
"metadata": {
"id": "QgR7b87UWDOT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Learn the vocabulary from the training sentences only, so no information\n",
"# from the test split influences the feature space.\n",
"vectorizer = CountVectorizer().fit(sentences_train)\n",
"\n",
"# Encode both splits with that same vocabulary.\n",
"X_train = vectorizer.transform(sentences_train)\n",
"X_test = vectorizer.transform(sentences_test)\n",
"\n",
"X_train"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xL3fcoLaXb0K",
"outputId": "15ada9b5-4983-4a7f-8dea-7efe8ac03da0"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<750x1714 sparse matrix of type ''\n",
"\twith 7368 stored elements in Compressed Sparse Row format>"
]
},
"metadata": {},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"# Baseline: logistic regression on the bag-of-words counts.\n",
"classifier = LogisticRegression().fit(X_train, y_train)\n",
"\n",
"# score() reports mean accuracy on the held-out split.\n",
"score = classifier.score(X_test, y_test)\n",
"print(\"Accuracy:\", score)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f_X_B-NAXbxP",
"outputId": "25c31651-9ad4-41b7-c35b-656273f81e60"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy: 0.796\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"def evaluate_baseline(df_source, test_size=0.25, random_state=1000):\n",
"    \"\"\"Return the test accuracy of a bag-of-words logistic regression for one source.\n",
"\n",
"    The split, vectorizer and classifier live only inside this function, so\n",
"    the notebook-level `sentences_train`, `X_train`, `y_train`, ... from the\n",
"    Yelp cells are not silently replaced by the last source in the loop.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df_source : DataFrame with a 'sentence' and a 'label' column.\n",
"    test_size : fraction of rows held out for testing.\n",
"    random_state : seed passed to train_test_split for a repeatable split.\n",
"    \"\"\"\n",
"    texts = df_source['sentence'].values\n",
"    labels = df_source['label'].values\n",
"\n",
"    texts_train, texts_test, labels_train, labels_test = train_test_split(\n",
"        texts, labels, test_size=test_size, random_state=random_state)\n",
"\n",
"    # Vocabulary is learned from the training sentences only.\n",
"    bow = CountVectorizer().fit(texts_train)\n",
"    clf = LogisticRegression().fit(bow.transform(texts_train), labels_train)\n",
"    return clf.score(bow.transform(texts_test), labels_test)\n",
"\n",
"\n",
"# Same baseline, evaluated separately on every review site.\n",
"for source in df['source'].unique():\n",
"    source_score = evaluate_baseline(df[df['source'] == source])\n",
"    print('Accuracy for {} data: {:.4f}'.format(source, source_score))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9n_E8ZTtXbup",
"outputId": "30db96fa-2bec-4456-c642-cfb3f2118cd6"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy for yelp data: 0.7960\n",
"Accuracy for amazon data: 0.7960\n",
"Accuracy for imdb data: 0.7487\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from keras.models import Sequential\n",
"from keras import layers\n",
"\n",
"# One input per column of the bag-of-words matrix.\n",
"input_dim = X_train.shape[1]\n",
"\n",
"# One hidden layer of 10 ReLU units feeding a single sigmoid output unit.\n",
"model = Sequential([\n",
"    layers.Dense(10, input_dim=input_dim, activation='relu'),\n",
"    layers.Dense(1, activation='sigmoid'),\n",
"])"
],
"metadata": {
"id": "IMBW44G8Xbrt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Binary cross-entropy matches the single sigmoid output; accuracy is tracked\n",
"# as an additional metric during training and evaluation.\n",
"model.compile(\n",
"    optimizer='adam',\n",
"    loss='binary_crossentropy',\n",
"    metrics=['accuracy'],\n",
")\n",
"\n",
"model.summary()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kx5mZB4eXbpB",
"outputId": "6796a2e5-fde6-4dfc-d180-5ffbda63222d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Model: \"sequential\"\n",
"_________________________________________________________________\n",
" Layer (type) Output Shape Param # \n",
"=================================================================\n",
" dense (Dense) (None, 10) 25060 \n",
" \n",
" dense_1 (Dense) (None, 1) 11 \n",
" \n",
"=================================================================\n",
"Total params: 25071 (97.93 KB)\n",
"Trainable params: 25071 (97.93 KB)\n",
"Non-trainable params: 0 (0.00 Byte)\n",
"_________________________________________________________________\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Training options. The held-out test split doubles as validation data here,\n",
"# so the validation curves are not an independent estimate.\n",
"fit_options = dict(\n",
"    epochs=100,\n",
"    batch_size=10,\n",
"    verbose=False,\n",
"    validation_data=(X_test, y_test),\n",
")\n",
"\n",
"history = model.fit(X_train, y_train, **fit_options)"
],
"metadata": {
"id": "iVVg-k5EXbmC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from keras.backend import clear_session\n",
"\n",
"# Reset the global state Keras has accumulated so far.\n",
"# NOTE(review): this runs after `model` was trained and before it is\n",
"# evaluated - confirm that this placement is intended.\n",
"clear_session()"
],
"metadata": {
"id": "BwSAo1IuXbje"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Accuracy on the data the network was fitted on.\n",
"loss, accuracy = model.evaluate(X_train, y_train, verbose=False)\n",
"print(\"Training Accuracy: {:.4f}\".format(accuracy))\n",
"\n",
"# Accuracy on the held-out split (the same split that served as validation\n",
"# data during fit).\n",
"loss, accuracy = model.evaluate(X_test, y_test, verbose=False)\n",
"print(\"Testing Accuracy: {:.4f}\".format(accuracy))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3vkG7Dj8Xbgk",
"outputId": "625575b7-194f-4c64-b186-b200f9523c16"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Accuracy: 1.0000\n",
"Testing Accuracy: 0.7754\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"plt.style.use('ggplot')\n",
"\n",
"def plot_history(history):\n",
"    \"\"\"Draw training-vs-validation accuracy and loss curves side by side.\n",
"\n",
"    `history` must expose a `.history` mapping with the keys 'accuracy',\n",
"    'val_accuracy', 'loss' and 'val_loss'; each value is a per-epoch sequence.\n",
"    \"\"\"\n",
"    # Pull all four series first so a missing key fails before any figure is created.\n",
"    logs = history.history\n",
"    acc, val_acc = logs['accuracy'], logs['val_accuracy']\n",
"    loss, val_loss = logs['loss'], logs['val_loss']\n",
"    epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis\n",
"\n",
"    # (subplot slot, training series, training label, validation series, validation label, title)\n",
"    panels = (\n",
"        (1, acc, 'Training acc', val_acc, 'Validation acc',\n",
"         'Training and validation accuracy'),\n",
"        (2, loss, 'Training loss', val_loss, 'Validation loss',\n",
"         'Training and validation loss'),\n",
"    )\n",
"\n",
"    plt.figure(figsize=(12, 5))\n",
"    for slot, train_series, train_label, val_series, val_label, title in panels:\n",
"        plt.subplot(1, 2, slot)\n",
"        plt.plot(epochs, train_series, 'b', label=train_label)\n",
"        plt.plot(epochs, val_series, 'r', label=val_label)\n",
"        plt.title(title)\n",
"        plt.legend()"
],
"metadata": {
"id": "YYDDBMfVXbdi"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Visualise how accuracy and loss evolved over the training epochs.\n",
"plot_history(history)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 469
},
"id": "dbJDJBA9Xbat",
"outputId": "0f3ba545-2b4a-416f-a6f3-94ff5a117892"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"