{ "cells": [ { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "base_path = './large_dataset/'" ] }, { "cell_type": "code", "execution_count": 188, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bangla_50000-60000_labeled_fixed.csv',\n", " 'bangla_40000-50000_labeled_fixed.csv',\n", " 'bangla_10000-20000_labeled.csv',\n", " 'bangla_100000-120000-labeled.csv',\n", " 'bangla_20000-30000_labeled.csv',\n", " 'bangla_1-10000-labeled-fixed.csv',\n", " 'bangla_60000-80000-labeled.csv',\n", " 'bangla_30000-40000_labeled_fixed.csv']" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "os.listdir(base_path)" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "dfl = []\n", "for fn in os.listdir(base_path):\n", " dfl.append(pd.read_csv(os.path.join(base_path, fn),\n", " names=['sentence_id', 'sentence', 'ignore', 'label']))" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence_idsentenceignorelabel
0301610সময় নষ্ট না করে পেজটি1.0-1
1301611হে মানবজাতি তোমরা কি এখনও বুঝতে পারছনা আমি চা...1.0-1
2301612কয়েক জন মানুষের জন্য এত আবেগ।1.02
3301613আর মায়ানমারে শত শত মানুষ মেরে ফেলা হচ্ছে তার ক...1.0-1
4301614শালারা বিমানে উঠার আগে ড্যান্স পার্টি করে মান...1.0-1
\n", "
" ], "text/plain": [ " sentence_id sentence ignore \\\n", "0 301610 সময় নষ্ট না করে পেজটি 1.0 \n", "1 301611 হে মানবজাতি তোমরা কি এখনও বুঝতে পারছনা আমি চা... 1.0 \n", "2 301612 কয়েক জন মানুষের জন্য এত আবেগ। 1.0 \n", "3 301613 আর মায়ানমারে শত শত মানুষ মেরে ফেলা হচ্ছে তার ক... 1.0 \n", "4 301614 শালারা বিমানে উঠার আগে ড্যান্স পার্টি করে মান... 1.0 \n", "\n", " label \n", "0 -1 \n", "1 -1 \n", "2 2 \n", "3 -1 \n", "4 -1 " ] }, "execution_count": 190, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Merging the dataframes \n", "df = pd.concat(dfl).dropna()\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [], "source": [ "# pandas drop columns using list of column names\n", "df = df.drop(['ignore'], axis=1)" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [], "source": [ "indexNames = df[(df['label'] == 'ss') |\n", " (df['label'] == 's') |\n", " (df['label'] == '.')].index\n", " \n", "# Delete these row indexes from dataFrame\n", "df.drop(indexNames , inplace=True)" ] }, { "cell_type": "code", "execution_count": 274, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentence_idsentencelabel
9996291605যারা রোহিঙ্গাদের দেশে জায়গা দেওয়ার পক্ষে তাদে...-1
9997291606রোহিঙ্গাদের গণহত্যার বিরুদে পৃথিবীর বিভিন্ন দ...-1
9998291607এই পৃথিবীর সবছেয়ে অমানবিক অশিক্ষিত মুরখ টি দে...-1
9999291608তাদের ভাব দেখে মনে হচ্ছে ভারত এখন আমেরিকা। নি...-1
10000291609ভারতের কেও ভাত পাই কি পায় না সেটা তোকে দেখতে ...5
\n", "
" ], "text/plain": [ " sentence_id sentence label\n", "9996 291605 যারা রোহিঙ্গাদের দেশে জায়গা দেওয়ার পক্ষে তাদে... -1\n", "9997 291606 রোহিঙ্গাদের গণহত্যার বিরুদে পৃথিবীর বিভিন্ন দ... -1\n", "9998 291607 এই পৃথিবীর সবছেয়ে অমানবিক অশিক্ষিত মুরখ টি দে... -1\n", "9999 291608 তাদের ভাব দেখে মনে হচ্ছে ভারত এখন আমেরিকা। নি... -1\n", "10000 291609 ভারতের কেও ভাত পাই কি পায় না সেটা তোকে দেখতে ... 5" ] }, "execution_count": 274, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 275, "metadata": {}, "outputs": [], "source": [ "df['label'] = df['label'].astype(int)" ] }, { "cell_type": "code", "execution_count": 276, "metadata": {}, "outputs": [], "source": [ "label_map = {-9: 0,\n", " -2: 1,\n", " -1: 2,\n", " 0: 3,\n", " 1: 4,\n", " 2: 5,\n", " 5: 6,\n", " }" ] }, { "cell_type": "code", "execution_count": 277, "metadata": {}, "outputs": [], "source": [ "df['label_enc'] = df['label'].map(label_map)" ] }, { "cell_type": "code", "execution_count": 278, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-1 45833\n", " 2 20535\n", " 0 18950\n", " 1 12743\n", "-2 1214\n", " 5 701\n", "-9 2\n", "Name: label, dtype: int64" ] }, "execution_count": 278, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['label'].value_counts()" ] }, { "cell_type": "code", "execution_count": 279, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2 45833\n", "5 20535\n", "3 18950\n", "4 12743\n", "1 1214\n", "6 701\n", "0 2\n", "Name: label_enc, dtype: int64" ] }, "execution_count": 279, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['label_enc'].value_counts()" ] }, { "cell_type": "code", "execution_count": 280, "metadata": {}, "outputs": [], "source": [ "docs = df['sentence'].values\n", "y = df['label_enc'].values" ] }, { "cell_type": "code", "execution_count": 281, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['সময় নষ্ট না করে পেজটি ',\n", " ' হে 
মানবজাতি তোমরা কি এখনও বুঝতে পারছনা আমি চাইলে তোমাদের যে কোন মূহুর্তেই মৃত্যুর সাধ গ্রহন করতে হবে।সুতারং সেদিনের কথা স্বরন কর যেদিন তোমাদের কর্ম ছাড়া আর কিছুই কাজে আসবেনা। ',\n", " ' কয়েক জন মানুষের জন্য এত আবেগ। ', ...,\n", " ' এই পৃথিবীর সবছেয়ে অমানবিক অশিক্ষিত মুরখ টি দেশের জনগণ তা হল মায়ানমার এর বৌদ্ধ সম্প্রদায়। ',\n", " ' তাদের ভাব দেখে মনে হচ্ছে ভারত এখন আমেরিকা। নিজের দেশের লোক ভাত পায়না আবার বড় কথা ',\n", " ' ভারতের কেও ভাত পাই কি পায় না সেটা তোকে দেখতে হবে তবে হা কাংলাদেশি তর ভাত দরকার হলে বলিস '],\n", " dtype=object)" ] }, "execution_count": 281, "metadata": {}, "output_type": "execute_result" } ], "source": [ "docs" ] }, { "cell_type": "code", "execution_count": 282, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer" ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [], "source": [ "vect = TfidfVectorizer(decode_error='ignore',\n", " max_features=20000)\n", "\n", "X = vect.fit_transform(docs)" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<99978x9685 sparse matrix of type ''\n", "\twith 805960 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 284, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)" ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='warn',\n", " n_jobs=None, penalty='l2', random_state=None, solver='warn',\n", " tol=0.0001, verbose=0, warm_start=False)" ] }, "execution_count": 286, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "# from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "model = LogisticRegression(penalty='l2', C=10)\n", "model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 287, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5933213305898491" ] }, "execution_count": 287, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.score(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 288, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.5274721610988864" ] }, "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.score(X_test, y_test)" ] }, { "cell_type": "code", "execution_count": 289, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix" ] }, { "cell_type": "code", "execution_count": 290, "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test)" ] }, { "cell_type": "code", "execution_count": 291, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 61, 243, 24, 2, 31, 0],\n", " [ 30, 11058, 1032, 598, 1065, 2],\n", " [ 5, 3171, 1284, 455, 644, 0],\n", " [ 5, 1780, 426, 1402, 259, 0],\n", " [ 5, 3403, 600, 192, 2015, 3],\n", " [ 1, 157, 8, 17, 15, 1]])" ] }, "execution_count": 291, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confusion_matrix(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [], "source": [ "test_sent = ' ভালবাসার গল্প পড়তে এই পেইজে লাইক দেন বন্ধুত্ব '" ] }, { "cell_type": "code", "execution_count": 293, "metadata": {}, "outputs": [], "source": [ "test_sent_features = vect.transform([test_sent])" ] }, { "cell_type": "code", "execution_count": 294, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([4])" ] }, "execution_count": 294, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"model.predict(test_sent_features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Learning curve" ] }, { "cell_type": "code", "execution_count": 295, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import learning_curve" ] }, { "cell_type": "code", "execution_count": 296, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/miniconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:652: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n", " % (min_groups, self.n_splits)), Warning)\n", "/home/ubuntu/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", " FutureWarning)\n", "/home/ubuntu/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", " \"this warning.\", FutureWarning)\n" ] } ], "source": [ "tsz = np.linspace(0.1, 1, 10)\n", "train_sizes, train_scores, test_scores = learning_curve(model, X, y, train_sizes=tsz, cv=3)" ] }, { "cell_type": "code", "execution_count": 297, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "\n", "fig = plt.figure()\n", "plt.plot(train_sizes, train_scores.mean(axis=1), 'ro-', label=\"Train Scores\")\n", "plt.plot(train_sizes, test_scores.mean(axis=1), 'go-', label=\"Test Scores\")\n", "plt.title('Learning Curve: Logistic Regression')\n", "plt.ylim((0.1, 1.0))\n", "plt.legend()\n", "plt.draw()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### চেষ্টা করে দেখি: NLP with deep learning\n", "আমরা Tokenizer ব্যবহার করছি tensorflow.keras থেকে:\n", "১. ভোকাবুলারি তৈরি করা\n", "২. বাক্যকে সংখ্যার সেকোয়েন্স তৈরি\n", "৩. সেকোয়েন্সকে প্যাড দেয়া যাতে সেটা টেন্সরের মতো ব্যবহার করা যায় pad_sequences ফাংশন ব্যবহার করছি tensorflow.keras থেকে" ] }, { "cell_type": "code", "execution_count": 320, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.preprocessing.text import Tokenizer" ] }, { "cell_type": "code", "execution_count": 321, "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(num_words=20000)\n", "\n", "# docs = df['quote']\n", "tokenizer.fit_on_texts(docs)\n", "sequences = tokenizer.texts_to_sequences(docs)" ] }, { "cell_type": "code", "execution_count": 322, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[[107, 562, 1, 3, 3452],\n", " [281,\n", " 276,\n", " 2,\n", " 939,\n", " 512,\n", " 18541,\n", " 18,\n", " 526,\n", " 228,\n", " 16,\n", " 11,\n", " 1013,\n", " 10252,\n", " 1014,\n", " 14,\n", " 12536,\n", " 23,\n", " 8379,\n", " 162,\n", " 2324,\n", " 228,\n", " 1951,\n", " 200,\n", " 4,\n", " 212,\n", " 539,\n", " 7218],\n", " [1043, 208, 110, 7, 111],\n", " [4, 610, 666, 666, 28, 331, 1467, 90, 27, 131, 289, 220],\n", " [1274, 3229, 6507, 76, 3694, 3, 256, 6508, 1035, 4525, 291, 132]]" ] }, "execution_count": 322, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sequences[:5]" ] }, { 
"cell_type": "code", "execution_count": 323, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "20000" ] }, "execution_count": 323, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_features = max([max(seq) for seq in sequences if len(seq) > 0]) + 1\n", "max_features" ] }, { "cell_type": "code", "execution_count": 324, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "115" ] }, "execution_count": 324, "metadata": {}, "output_type": "execute_result" } ], "source": [ "maxlen = max([len(seq) for seq in sequences])\n", "maxlen" ] }, { "cell_type": "code", "execution_count": 325, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.preprocessing.sequence import pad_sequences" ] }, { "cell_type": "code", "execution_count": 352, "metadata": {}, "outputs": [], "source": [ "maxlen=50" ] }, { "cell_type": "code", "execution_count": 353, "metadata": {}, "outputs": [], "source": [ "X = pad_sequences(sequences, maxlen=maxlen)" ] }, { "cell_type": "code", "execution_count": 354, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### আরেকটা নেটওয়ার্ক: recurrent neural network model\n", "- সেন্টিমেন্টকে ক্লাসিফাই করতে রিকারেন্ট নিউরাল নেটওয়ার্কের ব্যবহার " ] }, { "cell_type": "code", "execution_count": 355, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, 3, 4, 5, 6])" ] }, "execution_count": 355, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.unique(y_train)" ] }, { "cell_type": "code", "execution_count": 356, "metadata": {}, "outputs": [], "source": [ "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Activation, Embedding\n", "from tensorflow.keras.layers import LSTM, GRU" ] }, { "cell_type": "code", "execution_count": 357, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": 
"stream", "text": [ "W0913 22:42:56.640506 139675016652544 tf_logging.py:161] : Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.\n" ] } ], "source": [ "model1 = Sequential()\n", "model1.add(Embedding(input_dim=max_features,\n", " output_dim=32,\n", " input_length=maxlen))\n", "model1.add(LSTM(64))\n", "model1.add(Dense(7))\n", "model1.add(Activation('softmax'))\n", "\n", "model1.compile(loss='sparse_categorical_crossentropy',\n", " optimizer='adam',\n", " metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 358, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 62985 samples, validate on 6999 samples\n", "Epoch 1/4\n", "62985/62985 [==============================] - 13s 204us/sample - loss: 0.9779 - accuracy: 0.6240 - val_loss: 0.8152 - val_accuracy: 0.6961\n", "Epoch 2/4\n", "62985/62985 [==============================] - 12s 198us/sample - loss: 0.6876 - accuracy: 0.7444 - val_loss: 0.7712 - val_accuracy: 0.7007\n", "Epoch 3/4\n", "62985/62985 [==============================] - 13s 200us/sample - loss: 0.5782 - accuracy: 0.7907 - val_loss: 0.8043 - val_accuracy: 0.6922\n", "Epoch 4/4\n", "62985/62985 [==============================] - 13s 200us/sample - loss: 0.5088 - accuracy: 0.8189 - val_loss: 0.8484 - val_accuracy: 0.6901\n" ] } ], "source": [ "h = model1.fit(X_train, y_train, batch_size=64, epochs=4, validation_split=0.1)" ] }, { "cell_type": "code", "execution_count": 359, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "69984/69984 [==============================] - 10s 140us/sample - loss: 0.4572 - accuracy: 0.8457\n" ] }, { "data": { "text/plain": [ "[0.4571943279265485, 0.845679]" ] }, "execution_count": 359, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model1.evaluate(X_train, y_train, batch_size=32)" ] }, { "cell_type": "code", "execution_count": 360, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29994/29994 [==============================] - 4s 140us/sample - loss: 0.8291 - accuracy: 0.7025\n" ] }, { "data": { "text/plain": [ "0.7024738" ] }, "execution_count": 360, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loss, acc = model1.evaluate(X_test, y_test, batch_size=32)\n", "acc" ] }, { "cell_type": "code", "execution_count": 361, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 361, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "pd.DataFrame(h.history).plot(ylim=(-0.05, 1.05))" ] }, { "cell_type": "code", "execution_count": 362, "metadata": {}, "outputs": [], "source": [ "y_pred = model1.predict_classes(X_test)" ] }, { "cell_type": "code", "execution_count": 363, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 0, 1, 0, 0, 0, 0],\n", " [ 0, 185, 171, 5, 1, 22, 0],\n", " [ 0, 80, 11492, 992, 458, 772, 1],\n", " [ 0, 2, 2101, 2595, 782, 243, 0],\n", " [ 0, 0, 811, 554, 2189, 108, 2],\n", " [ 0, 15, 1205, 283, 114, 4609, 0],\n", " [ 0, 2, 148, 2, 15, 34, 0]])" ] }, "execution_count": 363, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confusion_matrix(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 343, "metadata": {}, "outputs": [], "source": [ "y_pred_all = model1.predict_classes(X)" ] }, { "cell_type": "code", "execution_count": 344, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 0, 1, 0, 0, 1, 0],\n", " [ 0, 807, 318, 13, 11, 65, 0],\n", " [ 0, 184, 41195, 1756, 958, 1738, 2],\n", " [ 0, 6, 4523, 12055, 1640, 726, 0],\n", " [ 0, 4, 1522, 1276, 9670, 270, 1],\n", " [ 0, 47, 2757, 502, 240, 16987, 2],\n", " [ 0, 10, 460, 9, 98, 115, 9]])" ] }, "execution_count": 344, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confusion_matrix(y, y_pred_all)" ] }, { "cell_type": "code", "execution_count": 350, "metadata": {}, "outputs": [], "source": [ "# positive (4) labeled as negative (2)\n", "pos_neg_errors_idx = (y == 4) & (y_pred_all == 4)" ] }, { "cell_type": "code", "execution_count": 351, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([' হে মহান ফুটবলারবৃন্দ তোমাদের স্মরণ গাঁথা কাব্য চিরদিন মনে থাকবে।তোমরা শান্তিতে থেকো। ',\n", " ' নিহত সবার পরিবারে প্রতি রইল অন্তরের অন্তস্থল থেকে গভীর সমবেদনা। ',\n", " ' নিহত সবার পরিবারে প্রতি রইল অন্তরের অন্তস্থল থেকে গভীর সমবেদনা জানাই শোকাহত। 
',\n", " ...,\n", " ' ঘুরে আসুন এই পেজ বাল লাগবে কাল্পনিক রাজ্য কাল্পনিক রাজ্য কাল্পনিক রাজ্য কাল্পনিক রাজ্য ',\n", " ' অস্থির একটা ফানি পেজ বাটপার বাটপার বাটপার বাটপার মজা নিতে লাইক দিন বাটপার বাটপার বাটপার বাটপার বাটপার বাটপার বাটপার ',\n", " ' জা করার তারাতারি করেন ৷ '], dtype=object)" ] }, "execution_count": 351, "metadata": {}, "output_type": "execute_result" } ], "source": [ "docs[pos_neg_errors_idx]" ] }, { "cell_type": "code", "execution_count": 337, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "' ভালবাসার গল্প পড়তে এই পেইজে লাইক দেন বন্ধুত্ব '" ] }, "execution_count": 337, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_sent" ] }, { "cell_type": "code", "execution_count": 338, "metadata": {}, "outputs": [], "source": [ "test_sent_numbers = tokenizer.texts_to_sequences([test_sent])" ] }, { "cell_type": "code", "execution_count": 339, "metadata": {}, "outputs": [], "source": [ "test_sent_X = pad_sequences(test_sent_numbers, maxlen=maxlen)" ] }, { "cell_type": "code", "execution_count": 340, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 2184, 780, 1426,\n", " 5, 924, 218, 295, 5132]], dtype=int32)" ] }, "execution_count": 340, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_sent_X" ] }, { "cell_type": "code", "execution_count": 341, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[3.2493996e-04, 1.3393437e-04, 1.4631876e-02, 9.5704895e-01,\n", " 2.3777947e-02, 3.7020321e-03, 3.8020010e-04]], dtype=float32)" ] }, "execution_count": 341, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "model1.predict(test_sent_X)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### অন্য কিছু দেখি " ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "W0912 23:33:49.180243 140230039140096 tf_logging.py:161] : Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.\n", "W0912 23:33:49.333416 140230039140096 tf_logging.py:161] : Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_5\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "embedding_5 (Embedding) (None, 99, 32) 487424 \n", "_________________________________________________________________\n", "unified_lstm_7 (UnifiedLSTM) (None, 99, 16) 3136 \n", "_________________________________________________________________\n", "unified_lstm_8 (UnifiedLSTM) (None, 8) 800 \n", "_________________________________________________________________\n", "dense_5 (Dense) (None, 1) 9 \n", "_________________________________________________________________\n", "activation_5 (Activation) (None, 1) 0 \n", "=================================================================\n", "Total params: 491,369\n", "Trainable params: 491,369\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "from tensorflow.keras import regularizers\n", "\n", "model2 = Sequential()\n", "model2.add(Embedding(input_dim=max_features,\n", " output_dim=32,\n", " input_length=maxlen))\n", "model2.add(LSTM(16, return_sequences=True, dropout=0.1))\n", "model2.add(LSTM(8, activity_regularizer=regularizers.l2(0.01), 
kernel_regularizer=regularizers.l2(0.01)))\n", "# y holds 7 integer-encoded classes (see label_map), so the output layer must be a 7-way\n", "# softmax trained with sparse categorical cross-entropy, the same head model1 uses.\n", "# A single sigmoid unit with binary cross-entropy cannot represent labels 0..6.\n", "model2.add(Dense(7))\n", "model2.add(Activation('softmax'))\n", "\n", "model2.compile(loss='sparse_categorical_crossentropy',\n", "               optimizer='adam',\n", "               metrics=['accuracy'])\n", "model2.summary()" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train on 2520 samples, validate on 280 samples\n", "Epoch 1/4\n", "2520/2520 [==============================] - 2s 838us/sample - loss: 0.8705 - accuracy: 0.5563 - val_loss: 0.8297 - val_accuracy: 0.6107\n", "Epoch 2/4\n", "2520/2520 [==============================] - 1s 540us/sample - loss: 0.7672 - accuracy: 0.6817 - val_loss: 0.7170 - val_accuracy: 0.6714\n", "Epoch 3/4\n", "2520/2520 [==============================] - 1s 535us/sample - loss: 0.5342 - accuracy: 0.8544 - val_loss: 0.5666 - val_accuracy: 0.8036\n", "Epoch 4/4\n", "2520/2520 [==============================] - 1s 536us/sample - loss: 0.3160 - accuracy: 0.9552 - val_loss: 0.5511 - val_accuracy: 0.8000\n" ] } ], "source": [ "from tensorflow.keras import callbacks\n", "\n", "# Train the stacked-LSTM Keras network (model2). The name `model` is bound to the\n", "# scikit-learn LogisticRegression, whose fit() accepts none of these keyword arguments.\n", "# EarlyStopping() with default settings stops training once val_loss stops improving.\n", "# NOTE: the stored outputs of this and the next two cells appear to come from an earlier\n", "# run on a smaller dataset; re-execute to refresh them.\n", "h = model2.fit(X_train, y_train, batch_size=64, epochs=4, validation_split=0.1, callbacks=[callbacks.EarlyStopping()])" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2800/2800 [==============================] - 1s 192us/sample - loss: 0.2718 - accuracy: 0.9650\n" ] }, { "data": { "text/plain": [ "[0.2718334950719561, 0.965]" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Loss and accuracy of model2 on the training split\n", "model2.evaluate(X_train, y_train, batch_size=64)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1200/1200 [==============================] - 0s 194us/sample - loss: 0.5122 - accuracy: 0.8092\n" ] }, { "data": { "text/plain": [ "0.80916667" ] }, "execution_count": 94, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "# Loss and accuracy of model2 on the held-out test split\n", "loss, acc = model2.evaluate(X_test, y_test, batch_size=64)\n", "acc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ফাস্টটেক্সট ব্যবহার\n", "\n", "ফাস্টটেক্সট এমবেডিং ব্যবহার করি " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }