{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "accelerator": "GPU", "colab": { "name": "pos-neg-sentiment-model-v1", "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "Q4DN769E2O_R" }, "source": [ "# Prepare Environment" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "zA07b51AGF5l", "outputId": "951aac9f-229c-414b-9728-4fe99df3eaad", "colab": { "base_uri": "https://localhost:8080/", "height": 68 } }, "source": [ "!pip install -q tensorflow-gpu==2.0.0-beta1\n", "# !pip install -q tensorflow-gpu==1.15" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "\u001b[K |████████████████████████████████| 348.9MB 42kB/s \n", "\u001b[K |████████████████████████████████| 3.1MB 60.1MB/s \n", "\u001b[K |████████████████████████████████| 501kB 57.3MB/s \n", "\u001b[?25h" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "zSeyZMq-BYsu", "outputId": "144d6afa-eca5-451c-b215-ccd8bbdb8301", "colab": { "base_uri": "https://localhost:8080/", "height": 445 } }, "source": [ "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "import numpy as np\n", "import os\n", "from sklearn.metrics import classification_report\n", "from gensim.models import Word2Vec\n", "\n" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "_SF78GWg74cW", "colab_type": "code", "outputId": "dfc359df-b552-462b-85e0-68f2f275c947", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "tf.__version__" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'2.0.0-beta1'" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "id": "RBhFY-4MPLR5", "colab_type": "code", "colab": {} }, "source": [ "# train and save word2vec model, this step can be removed after uploading the trained model to github\n", "# !wget https://github.com/raqueeb/datasets/raw/master/bnwiki-texts.zip\n", "# !unzip bnwiki-texts.zip\n", "\n", "# preprocessed_text_file_path = 'bnwiki-texts-preprocessed.txt'\n", "\n", "# lines_from_file = []\n", "# with open(preprocessed_text_file_path, encoding='utf8') as text_file:\n", "# for line in text_file:\n", "# lines_from_file.append(line)\n", "\n", "# tokenized_lines = []\n", "# for single_line in lines_from_file:\n", "# tokenized_lines.append(single_line.split())\n", "\n", "# model = Word2Vec(tokenized_lines, size=200, window=5, min_count=10)\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IrOCq-nUSU6U", "colab_type": "code", "colab": {} }, "source": [ "# model.wv.most_similar('ছেলে', topn=5)\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-pkgmHPTPvKH", "colab_type": "code", "colab": {} }, "source": [ "\n", "# model.wv.save_word2vec_format('bn-wiki-word2vec-300.txt', binary=False)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7i8D8fagTfgR", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "9FB7gLU4F54l" }, "source": [ "# Dataset\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "hqwIWFs_BAvb", "colab_type": "code", "outputId": "3ba187d5-e0cb-42ea-dfe5-1e33b3253369", "colab": { "base_uri": "https://localhost:8080/", "height": 187 } }, "source": [ "!wget http://119.81.77.70:8090/bn-wiki-word2vec-300.txt" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "--2019-11-21 05:31:24-- http://119.81.77.70:8090/bn-wiki-word2vec-300.txt\n", "Connecting to 119.81.77.70:8090... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 2496996336 (2.3G) [text/plain]\n", "Saving to: ‘bn-wiki-word2vec-300.txt’\n", "\n", "bn-wiki-word2vec-30 100%[===================>] 2.33G 12.7MB/s in 3m 7s \n", "\n", "2019-11-21 05:34:31 (12.8 MB/s) - ‘bn-wiki-word2vec-300.txt’ saved [2496996336/2496996336]\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "0SA-_IImNerZ", "colab_type": "code", "outputId": "ed80fdbd-f23c-47f5-caba-d13e45e6894a", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!ls" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "bn-wiki-word2vec-300.txt sample_data\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "5DY5Ze6pO1G5", "outputId": "a8d872cd-217f-4739-9236-0fba4fccc0fd", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "!wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings_v2/export_v2.py\n", "# !wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings/export.py\n" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "--2019-11-21 05:34:37-- https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings_v2/export_v2.py\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 7603 (7.4K) [text/plain]\n", "Saving to: ‘export_v2.py’\n", "\n", "\rexport_v2.py 0%[ ] 0 --.-KB/s \rexport_v2.py 100%[===================>] 7.42K --.-KB/s in 0s \n", "\n", "2019-11-21 05:34:37 (125 MB/s) - ‘export_v2.py’ saved [7603/7603]\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "Tkv5acr_Q9UU", "outputId": "896a6c49-5d52-4be5-ec13-72aad2e9cdb6", "colab": { "base_uri": "https://localhost:8080/", "height": 972 } }, "source": [ "!python export_v2.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1 \n", "# !python export.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1 --preprocess_text=True" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", "/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", "tcmalloc: large alloc 1607057408 bytes == 0x8a8f4000 @ 0x7f564d8411e7 0x7f5649de0f71 0x7f5649e4455d 0x7f5649e47e28 0x7f5649e483e5 0x7f5649edefc2 0x50abc5 0x50c549 0x509ce8 0x50aa1d 0x50c549 0x5081d5 0x509647 0x5951c1 0x54a11f 0x551761 0x5aa69c 0x50ab53 0x50c549 0x509ce8 0x50aa1d 0x50c549 0x509ce8 0x50aa1d 0x50c549 0x509ce8 0x50aa1d 0x50c549 0x5081d5 0x50a020 0x50aa1d\n", "2019-11-21 05:38:47.121986: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1\n", "2019-11-21 05:38:47.191968: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", "2019-11-21 05:38:47.192557: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: \n", "name: Tesla P100-PCIE-16GB major: 6 minor: 0 memoryClockRate(GHz): 1.3285\n", "pciBusID: 0000:00:04.0\n", "2019-11-21 05:38:47.193585: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcudart.so.10.0'; dlerror: libcudart.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.193815: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcublas.so.10.0'; dlerror: libcublas.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.194494: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcufft.so.10.0'; dlerror: libcufft.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.194613: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcurand.so.10.0'; dlerror: libcurand.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.195317: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcusolver.so.10.0'; dlerror: libcusolver.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.196074: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Could not dlopen library 'libcusparse.so.10.0'; dlerror: libcusparse.so.10.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia\n", "2019-11-21 05:38:47.599728: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7\n", "2019-11-21 05:38:47.599791: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1663] Cannot dlopen some GPU libraries. Skipping registering GPU devices...\n", "2019-11-21 05:38:47.622365: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA\n", "2019-11-21 05:38:47.802161: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", "2019-11-21 05:38:47.805560: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x322bf80 executing computations on platform CUDA. Devices:\n", "2019-11-21 05:38:47.805652: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0\n", "2019-11-21 05:38:47.879207: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2000175000 Hz\n", "2019-11-21 05:38:47.879573: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x322d9c0 executing computations on platform Host. Devices:\n", "2019-11-21 05:38:47.879605: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): , \n", "2019-11-21 05:38:47.879684: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:\n", "2019-11-21 05:38:47.879694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187] \n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/lookup_ops.py:1159: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "W1121 05:38:48.594012 140008648824704 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/lookup_ops.py:1159: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "INFO:tensorflow:Assets written to: text_module/assets\n", "I1121 05:38:55.261133 140008648824704 builder_impl.py:770] Assets written to: text_module/assets\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "k9WEpmedF_3_", "colab": {} }, "source": [ "module_path = \"text_module\"\n", "embedding_layer = hub.KerasLayer(module_path, trainable=False)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "Z1MBnaBUihWn", "outputId": "fce786ca-3371-4029-8b04-0df935a44e74", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "embedding_layer(['বাস বাস আমার '], ['আমার']).shape" ], "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "TensorShape([1, 300])" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "code", "metadata": { "id": "RLqLmkYfUPtl", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "bYv6LqlEChO1", "outputId": "aae2812e-22ab-4085-c5ed-c0c8e72175d8", "colab": { "base_uri": "https://localhost:8080/", "height": 357 } }, "source": [ "!wget http://119.81.77.70:8090/bangla-sentiment.neg\n", "!wget http://119.81.77.70:8090/bangla-sentiment.pos\n", "\n", "all_sentences = []\n", "with open('bangla-sentiment.pos', encoding='utf8') as f:\n", " all_sentences.extend([(line.strip(), 'positive') for line in f])\n", " \n", "with open('bangla-sentiment.neg', encoding='utf8') as f:\n", " all_sentences.extend([(line.strip(), 'negative') for line in f])" ], "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ "--2019-11-21 05:39:02-- http://119.81.77.70:8090/bangla-sentiment.neg\n", "Connecting to 119.81.77.70:8090... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 363162 (355K) [application/octet-stream]\n", "Saving to: ‘bangla-sentiment.neg’\n", "\n", "bangla-sentiment.ne 100%[===================>] 354.65K 386KB/s in 0.9s \n", "\n", "2019-11-21 05:39:04 (386 KB/s) - ‘bangla-sentiment.neg’ saved [363162/363162]\n", "\n", "--2019-11-21 05:39:05-- http://119.81.77.70:8090/bangla-sentiment.pos\n", "Connecting to 119.81.77.70:8090... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 220062 (215K) [application/octet-stream]\n", "Saving to: ‘bangla-sentiment.pos’\n", "\n", "bangla-sentiment.po 100%[===================>] 214.90K 235KB/s in 0.9s \n", "\n", "2019-11-21 05:39:07 (235 KB/s) - ‘bangla-sentiment.pos’ saved [220062/220062]\n", "\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "8b-UtAP5TL-W" }, "source": [ "We can check the distribution of labels in the training and validation examples after shuffling." ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "4BNXFrkotAYu", "outputId": "0562c847-baf2-47c8-cf90-8ae51d45ef4f", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "pos_count = 0\n", "neg_count = 0\n", "for sentence, label in all_sentences:\n", " if label =='positive':\n", " pos_count +=1\n", " else:\n", " neg_count +=1\n", "print(pos_count)\n", "print(neg_count)" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "text": [ "2039\n", "2520\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "eZRGTzEhUi7Q", "colab": {} }, "source": [ "import random\n", "\n", "def generator():\n", " random.shuffle(all_sentences) \n", " for sentence, label in all_sentences:\n", " if label =='positive':\n", " label = tf.keras.utils.to_categorical(1, num_classes=2)\n", " else:\n", " label = tf.keras.utils.to_categorical(0, num_classes=2)\n", " sentence_tensor = tf.constant(sentence, dtype=tf.dtypes.string)\n", " yield sentence_tensor, label" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "2g4nRflB7fbF", "colab": {} }, "source": [ "def make_dataset(train_size):\n", " data = tf.data.Dataset.from_generator(generator=generator, \n", " output_types=(tf.string, tf.float32))\n", " train_size = 4000\n", " train_data = data.take(train_size)\n", " validation_data = data.skip(train_size)\n", " return train_data, validation_data" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "8PuuN6el8tv9", "colab": { "base_uri": "https://localhost:8080/", "height": 238 }, "outputId": "9db52293-0f89-41a5-e536-559e4bae177d" }, "source": [ "train_data, validation_data = make_dataset(0.80)" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/dataset_ops.py:505: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "tf.py_func is deprecated in TF V2. Instead, there are two\n", " options available in V2.\n", " - tf.py_function takes a python function which manipulates tf eager\n", " tensors instead of numpy arrays. It's easy to convert a tf eager tensor to\n", " an ndarray (just call tensor.numpy()) but having access to eager tensors\n", " means `tf.py_function`s can use accelerators such as GPUs as well as\n", " being differentiable using a gradient tape.\n", " - tf.numpy_function maintains the semantics of the deprecated tf.py_func\n", " (it is not differentiable, and manipulates numpy arrays). It drops the\n", " stateful argument making all functions stateful.\n", " \n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "G0CyNOl1yajF", "colab_type": "code", "outputId": "e36d3171-72a0-4236-d435-638f1b52a5e6", "colab": { "base_uri": "https://localhost:8080/", "height": 139 } }, "source": [ "# get a single batch of 2 elements from train_data\n", "next(iter(train_data.batch(2)))" ], "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(, )" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] }, { "cell_type": "code", "metadata": { "id": "5cRCljsMCDmP", "colab_type": "code", "colab": {} }, "source": [ "sentences_in_a_single_batch, labels_in_a_single_batch = next(iter(train_data.batch(2)))" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "s9yZYRItCNWR", "colab_type": "code", "outputId": "228aefd7-447e-4449-d260-750c79e9a951", "colab": { "base_uri": "https://localhost:8080/", "height": 105 } }, "source": [ "sentences_in_a_single_batch" ], "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "id": "gU2HDStGCbfi", "colab_type": "code", "outputId": "b260ef88-d4b4-40fd-e0ba-eb2ef1af42b3", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "sentences_in_a_single_batch.shape" ], "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "TensorShape([2])" ] }, "metadata": { "tags": [] }, "execution_count": 20 } ] }, { "cell_type": "code", "metadata": { "id": "igZFiVMqChDo", "colab_type": "code", "outputId": "b33cb006-7303-449f-affc-e63b07d3e9b1", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "labels_in_a_single_batch.shape" ], "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "TensorShape([2, 2])" ] }, "metadata": { "tags": [] }, "execution_count": 21 } ] }, { "cell_type": "code", "metadata": { "id": "zFcaGNrIzABb", "colab_type": "code", "colab": {} }, "source": [ "sentence, label = next(iter(train_data.take(1)))" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "eu12m7AS2YVC", "colab_type": "code", "outputId": "321fea7c-5b1d-4937-e83f-b03275ef4d8c", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# numpy() returns the string as bytes. we need to decode it to read it\n", "sentence.numpy().decode('utf8')" ], "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'দোয়া করি সুস্থত হয়ে আবার সাভাবিক জিবন ফিরে পাবে'" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "id": "q7KcfmLC2ceA", "colab_type": "code", "outputId": "f4b94395-ac34-442d-9e1b-35a27ce95e2b", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# label after converted by to_categorical()\n", "label.numpy() " ], "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([0., 1.], dtype=float32)" ] }, "metadata": { "tags": [] }, "execution_count": 24 } ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "MrdZI6FqPJNP" }, "source": [ "# Model Training and Evaluation" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "WhCqbDK2uUV5" }, "source": [ "## Model" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "nHUw807XPPM9", "colab": {} }, "source": [ "def create_model():\n", " model = tf.keras.Sequential()\n", " model.add(embedding_layer)\n", " # model.add(tf.keras.layers.Flatten())\n", " # model.add(tf.keras.layers.SpatialDropout1D(0.2))\n", " # model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))\n", " # model.add(Dense(13, activation='softmax'))\n", " model.add(tf.keras.layers.Dense(256, activation=\"relu\"))\n", " model.add(tf.keras.layers.Dense(128, activation=\"relu\"))\n", " model.add(tf.keras.layers.Dense(2, activation=\"softmax\"))\n", " model.compile(optimizer=\"adam\",loss=\"categorical_crossentropy\",metrics=['acc'])\n", " return model" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "5J4EXJUmPVNG", "colab": {} }, "source": [ "model = create_model()\n", "# Create earlystopping callback\n", "# early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "ZZ7XJLg2u2No" }, "source": [ "## Training" ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "OoBkN2tAaXWD", "outputId": "ffa3f9f1-b24b-44b5-af01-6097c8549981", "colab": { "base_uri": "https://localhost:8080/", "height": 428 } }, "source": [ "batch_size = 256\n", "history = model.fit(train_data.batch(batch_size), \n", " validation_data=validation_data.batch(batch_size), \n", " epochs=10,)" ], "execution_count": 27, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/10\n", "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "16/16 [==============================] - 16s 1s/step - loss: 0.5320 - acc: 0.6640 - val_loss: 0.0000e+00 - val_acc: 0.0000e+00\n", "Epoch 2/10\n", "16/16 [==============================] - 14s 853ms/step - loss: 0.3657 - acc: 0.8356 - val_loss: 0.3144 - val_acc: 0.8497\n", "Epoch 3/10\n", "16/16 [==============================] - 14s 853ms/step - loss: 0.3059 - acc: 0.8659 - val_loss: 0.2848 - val_acc: 0.8962\n", "Epoch 4/10\n", "16/16 [==============================] - 14s 875ms/step - loss: 0.2651 - acc: 0.8897 - val_loss: 0.2313 - val_acc: 0.9177\n", "Epoch 5/10\n", "16/16 [==============================] - 14s 862ms/step - loss: 0.2218 - acc: 0.9159 - val_loss: 0.1790 - val_acc: 0.9374\n", "Epoch 6/10\n", "16/16 [==============================] - 14s 855ms/step - loss: 0.1933 - acc: 0.9289 - val_loss: 0.2051 - val_acc: 0.9159\n", "Epoch 7/10\n", "16/16 [==============================] - 14s 849ms/step - loss: 0.1654 - acc: 0.9347 - val_loss: 0.1152 - val_acc: 0.9750\n", "Epoch 8/10\n", "16/16 [==============================] - 14s 850ms/step - loss: 0.1372 - acc: 0.9572 - val_loss: 0.1119 - val_acc: 0.9660\n", "Epoch 9/10\n", "16/16 [==============================] - 14s 854ms/step - loss: 0.1095 - acc: 0.9674 - val_loss: 0.0864 - val_acc: 0.9839\n", "Epoch 10/10\n", "16/16 [==============================] - 14s 851ms/step - loss: 0.0925 - acc: 0.9803 - val_loss: 0.0705 - val_acc: 0.9875\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "lrR7SmiTk9t3", "colab_type": "code", "outputId": "15645648-5c3c-48f7-896f-5a0b6c26954c", "colab": { "base_uri": "https://localhost:8080/", "height": 289 } }, "source": [ "model.summary()" ], "execution_count": 28, "outputs": [ { "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "keras_layer (KerasLayer) multiple 200881800 \n", "_________________________________________________________________\n", "dense (Dense) multiple 77056 \n", "_________________________________________________________________\n", "dense_1 (Dense) multiple 32896 \n", "_________________________________________________________________\n", "dense_2 (Dense) multiple 258 \n", "=================================================================\n", "Total params: 200,992,010\n", "Trainable params: 110,210\n", "Non-trainable params: 200,881,800\n", "_________________________________________________________________\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "9DeGZFXsJt5g" }, "source": [ "## Saving model\n", "\n", "After training the model we can export it as a [SavedModel](https://www.tensorflow.org/beta/guide/saved_model) to deploy or share with others." ] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "rIO_CseWJtJP", "outputId": "140af371-4342-48b3-b030-a1456ec28a37", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "tf.saved_model.save(model, export_dir=\"my_model\")" ], "execution_count": 29, "outputs": [ { "output_type": "stream", "text": [ "INFO:tensorflow:Assets written to: my_model/assets\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "D54IXLqcG8Cq" }, "source": [ "## Prediction\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "ISbX3GzPoth8", "colab_type": "code", "outputId": "f92fcfae-81cb-46f6-8036-b4d332f5858b", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "sents = ['আমরা খুবি খুশি অফারটির জন্য', 'বই পড়তে পছন্দ করি', 'বই পড়তে পছন্দ করি না', 'আমার ভালো লাগছে না', \n", " 'আমার কষ্ট লাগছে', 'এই বইটা বেশ ভালো লাগছে', 'একটা দুর্ঘটনা ঘটে গেল',\n", " 'জিপি আমার প্রিয় নেটওয়ার্ক', 'মোবাইল অপারেটর বেশ টাকা কাটে', 'আমাদের প্রতিদিনের সমস্যা নিয়ে ঝামেলায় আছি',\n", " 'ঢাকা-সিলেটসহ আশপাশের সড়কের যানবাহন চলাচল বন্ধ হয়ে যায়',]\n", "pred_dataset = tf.data.Dataset.from_tensor_slices(sents)\n", "prediction = model.predict(np.array(sents))\n", "\n", "for sentence, pred_sentiment in zip(sents, prediction.argmax(axis=1)):\n", " print(\"Sentence:{} - predicted: {}\".format(sentence, pred_sentiment))" ], "execution_count": 37, "outputs": [ { "output_type": "stream", "text": [ "Sentence:আমরা খুবি খুশি অফারটির জন্য - predicted: 1\n", "Sentence:বই পড়তে পছন্দ করি - predicted: 1\n", "Sentence:বই পড়তে পছন্দ করি না - predicted: 0\n", "Sentence:আমার ভালো লাগছে না - predicted: 0\n", "Sentence:আমার কষ্ট লাগছে - predicted: 0\n", "Sentence:এই বইটা বেশ ভালো লাগছে - predicted: 1\n", "Sentence:একটা দুর্ঘটনা ঘটে গেল - predicted: 0\n", "Sentence:জিপি আমার প্রিয় নেটওয়ার্ক - predicted: 1\n", "Sentence:মোবাইল অপারেটর বেশ টাকা কাটে - predicted: 1\n", "Sentence:আমাদের প্রতিদিনের সমস্যা নিয়ে ঝামেলায় আছি - predicted: 0\n", "Sentence:ঢাকা-সিলেটসহ আশপাশের সড়কের যানবাহন চলাচল বন্ধ হয়ে যায় - predicted: 0\n" ], "name": "stdout" } ] } ] }