{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "text_classification.ipynb", "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyOSr9zFdQ4SSOAJ/mO4lVfv", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "tpO_fxIyVySA", "colab_type": "text" }, "source": [ "## আমাদের কর্পাস\n", "\n", "বাংলা ন্যাচারাল ল্যাঙ্গুয়েজ প্রসেসিং\n", "\n", "১. টোকেনাইজেশন\n", "\n", "২. ভেক্টরাইজেশন" ] }, { "cell_type": "code", "metadata": { "id": "D3yDo75TdgkK", "colab_type": "code", "colab": {} }, "source": [ "sentences = ['আমি মাঝে মধ্যেই ফিরে যাই পুরানো কিছু ক্লাসিক বইয়ে', 'বিশেষ করে বেসিক ঝালাই করার জন্য']" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qnhM3rgr25jd", "colab_type": "code", "outputId": "547f317e-1c19-4bd4-a522-ca98aed2d4c0", "colab": { "base_uri": "https://localhost:8080/", "height": 69 } }, "source": [ "import nltk\n", "nltk.download('punkt')" ], "execution_count": 55, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": { "tags": [] }, "execution_count": 55 } ] }, { "cell_type": "code", "metadata": { "id": "mOGXQnKRWGYP", "colab_type": "code", "outputId": "5ff83c04-4ac8-4214-a085-9e86b66a7632", "colab": { "base_uri": "https://localhost:8080/", "height": 35 } }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "# ট্রান্সফরমেশন তৈরি করি\n", "vectorizer = CountVectorizer()\n", "\n", "# টোকেনাইজ এবং ভোকাবুলারি তৈরি করি\n", "vectorizer.fit(sentences)\n", "\n", "# সামারি দেখি\n", "vectorizer.vocabulary_" ], "execution_count": 56,
"outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'আম': 0, 'কর': 1, 'জন': 2, 'বইয়': 3, 'মধ': 4}" ] }, "metadata": { "tags": [] }, "execution_count": 56 } ] }, { "cell_type": "code", "metadata": { "id": "PrzmLNq0W7Dw", "colab_type": "code", "colab": {} }, "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "B7YCjhUbdlWL", "colab_type": "code", "outputId": "b28d89eb-10d8-4e53-c08e-693d83d7927b", "colab": { "base_uri": "https://localhost:8080/", "height": 277 } }, "source": [ "# ইউনিকোডে দেখুন নিচের লিঙ্কে\n", "# https://jrgraphix.net/r/Unicode/0980-09FF\n", "\n", "from nltk import word_tokenize\n", "\n", "vectorizer = CountVectorizer(encoding='utf-8', tokenizer=word_tokenize)\n", "\n", "vectorizer.fit(sentences)\n", "vectorizer.vocabulary_" ], "execution_count": 58, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'আমি': 0,\n", " 'করার': 1,\n", " 'করে': 2,\n", " 'কিছু': 3,\n", " 'ক্লাসিক': 4,\n", " 'জন্য': 5,\n", " 'ঝালাই': 6,\n", " 'পুরানো': 7,\n", " 'ফিরে': 8,\n", " 'বইয়ে': 9,\n", " 'বিশেষ': 10,\n", " 'বেসিক': 11,\n", " 'মধ্যেই': 12,\n", " 'মাঝে': 13,\n", " 'যাই': 14}" ] }, "metadata": { "tags": [] }, "execution_count": 58 } ] }, { "cell_type": "code", "metadata": { "id": "3aRbrXdvU-kB", "colab_type": "code", "outputId": "c0319153-7390-4980-cb26-9a5b0c34d773", "colab": { "base_uri": "https://localhost:8080/", "height": 55 } }, "source": [ "print(vectorizer.vocabulary_)" ], "execution_count": 59, "outputs": [ { "output_type": "stream", "text": [ "{'আমি': 0, 'মাঝে': 13, 'মধ্যেই': 12, 'ফিরে': 8, 'যাই': 14, 'পুরানো': 7, 'কিছু': 3, 'ক্লাসিক': 4, 'বইয়ে': 9, 'বিশেষ': 10, 'করে': 2, 'বেসিক': 11, 'ঝালাই': 6, 'করার': 1, 'জন্য': 5}\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "vp-68i2y4-p6", "colab_type": "code", "outputId": "b1744d15-ff45-4dde-bb96-581f92de9943", "colab": { "base_uri": 
"https://localhost:8080/", "height": 52 } }, "source": [ "vectorizer.transform(sentences).toarray()" ], "execution_count": 60, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1],\n", " [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0]])" ] }, "metadata": { "tags": [] }, "execution_count": 60 } ] }, { "cell_type": "code", "metadata": { "id": "Wdb8FgzeOE0-", "colab_type": "code", "outputId": "ce5dce6e-aa6a-42bb-c4f1-ae9e316fef90", "colab": { "base_uri": "https://localhost:8080/", "height": 52 } }, "source": [ "vec = CountVectorizer()\n", "x = vec.fit_transform(sentences).toarray()\n", "print(x.shape)\n", "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()\n", "# is its replacement. It returns an ndarray, so .tolist() keeps the printed\n", "# form a plain Python list of strings.\n", "print(vec.get_feature_names_out().tolist())" ], "execution_count": 61, "outputs": [ { "output_type": "stream", "text": [ "(2, 5)\n", "['আম', 'কর', 'জন', 'বইয়', 'মধ']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "43MkcTpZ6klp", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 277 }, "outputId": "055f7f1f-51fa-4ebc-82c7-539401057eea" }, "source": [ "# ইউনিকোডে দেখুন নিচের লিঙ্কে\n", "# https://jrgraphix.net/r/Unicode/0980-09FF\n", "\n", "vectorizer = CountVectorizer(encoding='utf-8', token_pattern=r'[\\u0980-\\u09ff]+')\n", "vectorizer.fit(sentences)\n", "vectorizer.vocabulary_" ], "execution_count": 62, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'আমি': 0,\n", " 'করার': 1,\n", " 'করে': 2,\n", " 'কিছু': 3,\n", " 'ক্লাসিক': 4,\n", " 'জন্য': 5,\n", " 'ঝালাই': 6,\n", " 'পুরানো': 7,\n", " 'ফিরে': 8,\n", " 'বইয়ে': 9,\n", " 'বিশেষ': 10,\n", " 'বেসিক': 11,\n", " 'মধ্যেই': 12,\n", " 'মাঝে': 13,\n", " 'যাই': 14}" ] }, "metadata": { "tags": [] }, "execution_count": 62 } ] }, { "cell_type": "code", "metadata": { "id": "F3UJHipiOma3", "colab_type": "code", "outputId": "57419cba-72a5-4202-f640-7594a8e1bddc", "colab": { "base_uri": "https://localhost:8080/", "height": 277 } }, "source": [ "from sklearn.feature_extraction.text
import TfidfVectorizer\n", "\n", "vectorizer =TfidfVectorizer(encoding='utf-8', tokenizer=word_tokenize)\n", "\n", "vectorizer.fit(sentences)\n", "vectorizer.vocabulary_" ], "execution_count": 63, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'আমি': 0,\n", " 'করার': 1,\n", " 'করে': 2,\n", " 'কিছু': 3,\n", " 'ক্লাসিক': 4,\n", " 'জন্য': 5,\n", " 'ঝালাই': 6,\n", " 'পুরানো': 7,\n", " 'ফিরে': 8,\n", " 'বইয়ে': 9,\n", " 'বিশেষ': 10,\n", " 'বেসিক': 11,\n", " 'মধ্যেই': 12,\n", " 'মাঝে': 13,\n", " 'যাই': 14}" ] }, "metadata": { "tags": [] }, "execution_count": 63 } ] }, { "cell_type": "code", "metadata": { "id": "Dsr8-tmwdp0O", "colab_type": "code", "outputId": "4bd0b4eb-abd9-44b8-ce30-28392851df49", "colab": { "base_uri": "https://localhost:8080/", "height": 121 } }, "source": [ "vectorizer.transform(sentences).toarray()" ], "execution_count": 64, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0.33333333, 0. , 0. , 0.33333333, 0.33333333,\n", " 0. , 0. , 0.33333333, 0.33333333, 0.33333333,\n", " 0. , 0. , 0.33333333, 0.33333333, 0.33333333],\n", " [0. , 0.40824829, 0.40824829, 0. , 0. ,\n", " 0.40824829, 0.40824829, 0. , 0. , 0. ,\n", " 0.40824829, 0.40824829, 0. , 0. , 0. 
]])" ] }, "metadata": { "tags": [] }, "execution_count": 64 } ] }, { "cell_type": "code", "metadata": { "id": "qNEsv14BYRIQ", "colab_type": "code", "outputId": "5fee99ef-094e-4011-f4d6-bc50f96c75b4", "colab": { "base_uri": "https://localhost:8080/", "height": 69 } }, "source": [ "print(vectorizer.idf_)" ], "execution_count": 65, "outputs": [ { "output_type": "stream", "text": [ "[1.40546511 1.40546511 1.40546511 1.40546511 1.40546511 1.40546511\n", " 1.40546511 1.40546511 1.40546511 1.40546511 1.40546511 1.40546511\n", " 1.40546511 1.40546511 1.40546511]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "5vt20FLIYpoW", "colab_type": "code", "outputId": "01e1f0a3-c306-47e8-8031-f553259eec74", "colab": { "base_uri": "https://localhost:8080/", "height": 87 } }, "source": [ "vector = vectorizer.transform([sentences[0]])\n", "# এনকোডেড ভেক্টরকে সামারাইজ করি\n", "print(vector.shape)\n", "print(vector.toarray())" ], "execution_count": 66, "outputs": [ { "output_type": "stream", "text": [ "(1, 15)\n", "[[0.33333333 0. 0. 0.33333333 0.33333333 0.\n", " 0. 0.33333333 0.33333333 0.33333333 0. 
0.\n", " 0.33333333 0.33333333 0.33333333]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "Sx0aljIreVvg", "colab_type": "code", "outputId": "17918534-dc76-492e-a78c-89b9199e2e87", "colab": { "base_uri": "https://localhost:8080/", "height": 35 } }, "source": [ "cities = ['ঢাকা', 'বার্লিন', 'কুমিল্লা', 'শিকাগো', 'সিঙ্গাপুর']\n", "cities" ], "execution_count": 67, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['ঢাকা', 'বার্লিন', 'কুমিল্লা', 'শিকাগো', 'সিঙ্গাপুর']" ] }, "metadata": { "tags": [] }, "execution_count": 67 } ] }, { "cell_type": "code", "metadata": { "id": "uaK8h7hgfbuW", "colab_type": "code", "colab": {} }, "source": [ "from sklearn.preprocessing import LabelEncoder" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8WYvhQsEffDV", "colab_type": "code", "outputId": "9c623882-50b3-4133-b132-bb39dd3a6ae9", "colab": { "base_uri": "https://localhost:8080/", "height": 35 } }, "source": [ "encoder = LabelEncoder()\n", "city_labels = encoder.fit_transform(cities)\n", "city_labels" ], "execution_count": 69, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([1, 2, 0, 3, 4])" ] }, "metadata": { "tags": [] }, "execution_count": 69 } ] }, { "cell_type": "code", "metadata": { "id": "GK4o6WGIhnxh", "colab_type": "code", "outputId": "861b1f05-e1ef-4e50-f09e-0f7d3193d942", "colab": { "base_uri": "https://localhost:8080/", "height": 104 } }, "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "\n", "# Keep the encoder's default sparse output and densify with .toarray():\n", "# the `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2\n", "# and later removed, so this spelling runs on both old and new releases.\n", "encoder = OneHotEncoder()\n", "# reshape(-1, 1) yields a single column whatever the number of cities.\n", "city_labels = city_labels.reshape(-1, 1)\n", "encoder.fit_transform(city_labels).toarray()" ], "execution_count": 70, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0., 1., 0., 0., 0.],\n", " [0., 0., 1., 0., 0.],\n", " [1., 0., 0., 0., 0.],\n", " [0., 0., 0., 1., 0.],\n", " [0., 0., 0., 0., 1.]])" ] }, "metadata": { "tags": [] }, "execution_count": 70 } ] } ] }