{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "wikipedia_embeddings.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.8"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/raqueeb/TensorFlow2/blob/master/wikipedia_embeddings.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QEx9dAerBzOm",
        "colab_type": "text"
      },
      "source": [
        ""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ew82tJs67G1R",
        "colab_type": "text"
      },
      "source": [
        "আমাদের এই টেক্সট ফাইলটা দেখে নিতে পারেন। প্রতিটা লাইনে একটা করে বাংলা বাক্য আছে। বাংলা উইকিপিডিয়া থেকে নেয়া। প্রি-প্রসেসিং করে নিয়েছি আগেই। এখানে সহায়তা দিয়েছেন তারেক আল মুনতাসির। আমাদের এই টেক্সট ফাইলটা দেখে নিতে পারেন। প্রতিটা লাইনে একটা করে বাংলা বাক্য আছে। বাংলা উইকিপিডিয়া থেকে নেয়া। প্রি-প্রসেসিং করে নিয়েছি আগেই। এখানে সহায়তা দিয়েছেন তারেক আল মুনতাসির। আঅ"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "css35ZIlIesJ",
        "colab": {}
      },
      "source": [
        "import re\n",
        "import os\n",
        "import glob\n",
        "import string\n",
        "\n",
        "## দুটো মডেল দেখি 'জেনসিম' ফ্রেমওয়ার্ক দিয়ে\n",
        "from gensim.models import Word2Vec\n",
        "from gensim.models import FastText"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EeyJuXl_KAzj",
        "colab_type": "code",
        "outputId": "34082019-c63f-4a57-ebd2-d4fa9f80aac3",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "!wget https://media.githubusercontent.com/media/raqueeb/datasets/master/bnwiki-texts.zip"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2019-11-02 11:30:58--  https://media.githubusercontent.com/media/raqueeb/datasets/master/bnwiki-texts.zip\n",
            "Resolving media.githubusercontent.com (media.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
            "Connecting to media.githubusercontent.com (media.githubusercontent.com)|151.101.0.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 61696513 (59M) [application/zip]\n",
            "Saving to: ‘bnwiki-texts.zip’\n",
            "\n",
            "\rbnwiki-texts.zip      0%[                    ]       0  --.-KB/s               \rbnwiki-texts.zip     25%[====>               ]  14.99M  74.9MB/s               \rbnwiki-texts.zip     66%[============>       ]  39.30M  97.9MB/s               \rbnwiki-texts.zip    100%[===================>]  58.84M   105MB/s    in 0.6s    \n",
            "\n",
            "2019-11-02 11:31:00 (105 MB/s) - ‘bnwiki-texts.zip’ saved [61696513/61696513]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f0UbGF4HNDM7",
        "colab_type": "code",
        "outputId": "c0613d7f-eff0-47f3-cab0-23f49c42771a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        }
      },
      "source": [
        "!ls -al"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "total 60268\n",
            "drwxr-xr-x 1 root root     4096 Nov  2 11:30 .\n",
            "drwxr-xr-x 1 root root     4096 Nov  2 11:24 ..\n",
            "-rw-r--r-- 1 root root 61696513 Nov  2 11:30 bnwiki-texts.zip\n",
            "drwxr-xr-x 1 root root     4096 Oct 30 15:14 .config\n",
            "drwxr-xr-x 1 root root     4096 Oct 25 16:58 sample_data\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "eqj1LzMqygcn",
        "colab_type": "code",
        "outputId": "d5610fef-23c4-43f7-c4c1-2b7f1285e4f4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "!unzip bnwiki-texts.zip"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Archive:  bnwiki-texts.zip\n",
            "  inflating: bnwiki-texts-preprocessed.txt  \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lw0wsKOtOJhC",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kUES-_k9vAI2",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "preprocessed_text_file_path = 'bnwiki-texts-preprocessed.txt'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "mmcRCrb8CbjQ",
        "colab_type": "text"
      },
      "source": [
        ""
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wpbNqMMjvAI-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "lines_from_file = []\n",
        "with open(preprocessed_text_file_path, encoding='utf8') as text_file:\n",
        "    for line in text_file:\n",
        "        lines_from_file.append(line)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0629wIhUeo4U",
        "colab_type": "text"
      },
      "source": [
        "কতগুলো লাইন আছে এই ফাইলে?"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0wD_U3zBaqnv",
        "colab_type": "code",
        "outputId": "d32165b4-9176-44d5-8839-c0b45e3011f5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "len(lines_from_file)"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1363435"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hH9Y4ylRvAJG",
        "colab_type": "text"
      },
      "source": [
        "আমাদের জেনসিম ওয়ার্ড২ভেক এবং ফাস্টটেক্সট শব্দের লিস্ট আশা করে একেকটা বাক্য/প্রতিটা লাইনে। যেমন, [[\"আমি\", \"এখন\", \"বইটি\", \"পরছি\"],[\"বইটি\", \"অনেক\", \"ভাল\"]]\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9GVytXpevAJI",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "tokenized_lines = []\n",
        "for single_line in lines_from_file:\n",
        "    tokenized_lines.append(single_line.split())"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Pyfc1xS6vAJN",
        "colab_type": "code",
        "outputId": "5bfe5f72-a5c9-4828-eb36-561e91ab285b",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "print(len(tokenized_lines))"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1363435\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "LOSqlC-5Ies-",
        "outputId": "cda2210a-8d50-4fbc-9b39-e854330d98ad",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        }
      },
      "source": [
        "print(tokenized_lines[0])"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['বাংলা', 'ভাষা', 'বাংলা', 'ভাষা', 'বাংলা', 'ভাষা', 'বাঙলা', 'বাঙ্গলা', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', 'একটি', 'ইন্দো', 'আর্য', 'ভাষা', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "qK9XVTfWIetC",
        "colab": {}
      },
      "source": [
        "model = Word2Vec(tokenized_lines, size=200, window=5, min_count=10)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F6MgKNt6EOzo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "fasttext_model = FastText(tokenized_lines, size=200, window=5, min_count=10)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "outputId": "476a04be-5c7a-4346-fab7-a49636464d52",
        "id": "8MUnrU8svJT4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 867
        }
      },
      "source": [
        "vector_a = fasttext_model.wv.get_vector('বাংলা')\n",
        "print(vector_a)"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[ 3.10908675e+00 -2.38071442e+00 -9.70043957e-01 -5.66025078e-01\n",
            " -1.07619822e+00  4.47757959e+00  1.04165363e+00 -3.94767380e+00\n",
            "  8.38609576e-01  1.27086356e-01  1.58916414e+00  1.51394141e+00\n",
            " -3.70067549e+00  1.15824379e-01 -1.39869475e+00  3.59312266e-01\n",
            " -1.59174860e+00 -1.45606112e+00 -3.01952982e+00 -1.06670964e+00\n",
            " -1.29315794e+00  3.36348724e+00  7.83140123e-01  6.18322074e-01\n",
            "  1.14647853e+00 -2.00278902e+00  9.76834223e-02 -9.06915814e-02\n",
            " -1.35848308e+00  2.61401916e+00  4.11325842e-01 -4.95882053e-03\n",
            " -1.85620356e+00  5.86531878e-01  1.08975947e+00 -3.19484925e+00\n",
            "  1.23863375e+00  2.76096940e+00  2.97603616e-03 -1.30080569e+00\n",
            "  2.34073353e+00 -1.85653794e+00  1.02879755e-01 -3.77274960e-01\n",
            " -2.59784341e-01  6.55955255e-01  8.40521753e-01 -1.47725356e+00\n",
            " -1.72506261e+00 -7.94822931e-01 -1.88007689e+00 -2.67613363e+00\n",
            " -4.13702548e-01  3.10200858e+00  1.60251498e+00 -2.42116499e+00\n",
            " -2.17445597e-01  5.44657588e-01 -1.67769241e+00 -3.09820700e+00\n",
            " -1.82247496e+00 -3.44184041e-01 -1.98292565e+00  1.74476191e-01\n",
            "  7.13144481e-01  1.84219733e-01 -4.89389211e-01  1.22903883e+00\n",
            " -4.62614506e-01  8.72249544e-01 -1.02143466e+00 -7.08562076e-01\n",
            "  4.35238071e-02 -4.38914359e-01 -1.78393006e+00  6.14556909e-01\n",
            "  2.00046968e+00 -8.33699644e-01 -1.65362835e-01  1.87251043e+00\n",
            "  2.25374508e+00  5.84853113e-01 -3.86706561e-01  3.64042020e+00\n",
            "  5.63881040e-01  2.74816084e+00  3.06213784e+00 -1.27971780e+00\n",
            "  6.31614104e-02 -5.19928942e-03 -1.30408287e+00 -1.02523315e+00\n",
            "  4.31055456e-01  2.32871556e+00 -1.75728738e+00  3.65903759e+00\n",
            "  1.15330887e+00  2.57397246e+00  1.75699246e+00  1.51429367e+00\n",
            " -5.08826971e-01  6.03686869e-01  1.06674775e-01 -1.83964896e+00\n",
            "  5.94799995e-01  2.07831573e+00  3.67107248e+00 -2.18767881e+00\n",
            "  1.45486808e+00  8.22293937e-01 -2.80425692e+00  8.74112919e-02\n",
            "  2.61130476e+00  1.90909696e+00 -2.17753386e+00 -1.29431283e+00\n",
            "  1.42674744e+00  2.51345158e+00  1.60967469e+00 -1.49379475e-02\n",
            "  1.82418466e+00  3.22541207e-01 -6.54721975e-01  1.93768248e-01\n",
            " -4.59367305e-01 -5.77465892e-01  5.70271313e-01  1.34173846e+00\n",
            " -2.81269073e+00 -5.05183041e-01  5.47760844e-01 -2.12078497e-01\n",
            " -1.20465791e+00  1.72587800e+00 -8.81767571e-01 -1.85678065e+00\n",
            " -1.73196900e+00  9.98399138e-01  2.41938639e+00 -1.84551394e+00\n",
            " -9.06164587e-01  6.02274477e-01  1.45480418e+00  1.88540429e-01\n",
            "  4.30855453e-01  1.64853406e+00  2.47411418e+00  1.92891765e+00\n",
            " -2.19521308e+00  4.46682781e-01  2.29912496e+00  2.62496662e+00\n",
            "  1.42786920e-01 -2.22294140e+00 -1.13450503e+00  7.42880940e-01\n",
            " -8.45801473e-01 -9.76455331e-01 -7.41640627e-01  2.71321321e+00\n",
            " -7.17415869e-01  3.15599829e-01 -7.29448199e-01  6.16044462e-01\n",
            "  1.56039202e+00  3.18467259e-01  3.07009757e-01  1.85720837e+00\n",
            "  2.67635882e-01  2.57449389e-01  1.40100133e+00 -4.22247839e+00\n",
            "  8.93778503e-02 -1.38147458e-01  9.96620238e-01 -1.24018073e+00\n",
            " -3.63333583e+00 -1.43686414e+00  9.18335259e-01  1.02571118e+00\n",
            " -1.52965581e+00 -2.65177935e-01 -2.10557431e-01  6.07441604e-01\n",
            " -2.89372039e+00 -1.29234934e+00 -5.66955984e-01  1.62534082e+00\n",
            "  1.42871010e+00 -1.20629227e+00  1.34718075e-01  1.88303065e+00\n",
            " -1.41340458e+00 -1.39459598e+00  7.42102444e-01 -2.43158758e-01\n",
            " -3.70771098e+00  1.77078104e+00  2.13890696e+00  7.48850524e-01]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "7ICb0qLSIetF",
        "colab": {}
      },
      "source": [
        "# শুরুতে ওয়ার্ড২ভেক ব্যবহার করছি, model এসেছে  ওয়ার্ড২ভেক\n",
        "print(\"ছেলে শব্দটার সবচেয়ে কাছাকাছি শব্দ কি?\")\n",
        "model.wv.most_similar('ছেলে', topn=5)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TYxJekyefvgD",
        "colab_type": "text"
      },
      "source": [
        "আমার পছন্দ ফাস্টটেক্সট, দেখি তাদের রেজাল্ট। "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CwFmlgLAFXwr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(\"ফাস্টটেক্সট উদাহরণ: ছেলে শব্দটার সবচেয়ে কাছাকাছি শব্দ কি?\")\n",
        "fasttext_model.wv.most_similar('ছেলে', topn=5)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "vlf8VWhEIetJ",
        "colab": {}
      },
      "source": [
        "print(\"What is Father + Girl - Boy =?\")\n",
        "model.wv.most_similar(positive=['বাবা', 'মেয়ে'], negative=['ছেলে'], topn=5)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "K2eHIpTQG25f",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(model.wv['বাংলাদেশ'])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "vJz_u8ZQIetL",
        "colab": {}
      },
      "source": [
        "print('এখানে কোন শব্দটা যাচ্ছে না বাকিদের সাথে?')\n",
        "model.wv.doesnt_match(\"ঢাকা রাজশাহী রংপুর নজরুল\".split())"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Es7_9BJ2Mudn",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print(model.wv.similarity('শিক্ষা', 'শিক্ষিত'))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WDfvR_yUHZmY",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "semantically_similar_words = {words: [item[0] for item in model.wv.most_similar([words], topn=5)]\n",
        "                  for words in ['বাংলা', 'মাতা', 'একুশে', 'ভাষা', 'আনন্দ', 'আকাশ']}\n",
        "\n",
        "for k,v in semantically_similar_words.items():\n",
        "    print(k+\":\"+str(v))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tZWWvpNdIPlK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "semantically_similar_words = {words: [item[0] for item in fasttext_model.wv.most_similar([words], topn=5)]\n",
        "                  for words in ['বাংলা', 'মাতা', 'একুশে', 'ভাষা', 'আনন্দ', 'আকাশ']}\n",
        "\n",
        "for k,v in semantically_similar_words.items():\n",
        "    print(k+\":\"+str(v))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aiRl4C0tljHW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from gensim.models.phrases import Phrases\n",
        "bigram = Phrases(tokenized_lines, min_count=3, threshold=10)\n",
        "print(bigram[tokenized_lines[0]])"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0C7-x-x6I1PO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from sklearn.decomposition import PCA\n",
        "\n",
        "all_similar_words = sum([[k] + v for k, v in semantically_similar_words.items()], [])\n",
        "\n",
        "print(all_similar_words)\n",
        "print(type(all_similar_words))\n",
        "print(len(all_similar_words))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5rygCrGZJJZr",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "word_vectors = fasttext_model.wv[all_similar_words]\n",
        "\n",
        "pca = PCA(n_components=2)\n",
        "\n",
        "p_comps = pca.fit_transform(word_vectors)\n",
        "word_names = all_similar_words\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "plt.figure(figsize=(18, 10))\n",
        "plt.scatter(p_comps[:, 0], p_comps[:, 1], c='red')\n",
        "\n",
        "for word_names, x, y in zip(word_names, p_comps[:, 0], p_comps[:, 1]):\n",
        "    plt.annotate(word_names, xy=(x+0.06, y+0.03), xytext=(0, 0), textcoords='offset points')"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}