{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "machine_translation.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "lJI_2lVuorP7" }, "source": [ "# PyThaiNLP Translate" ] }, { "cell_type": "markdown", "metadata": { "id": "VrfKSqsCpwGy" }, "source": [ "\n", "\n", "We use the machine translation models from [The VISTEC-depa Thailand Artificial Intelligence Research Institute](https://airesearch.in.th/releases/machine-translation-models/)." ] }, { "cell_type": "markdown", "metadata": { "id": "E5lyybeKpw7L" }, "source": [ "## Install" ] }, { "cell_type": "code", "metadata": { "id": "1r4L-oDsoZ2Q", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e273bd3c-74da-4163-9305-1aabdbcac57f" }, "source": [ "!pip install fairseq" ], "execution_count": 1, "outputs": [ { "output_type": "stream", "text": [ "Collecting fairseq\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/15/ab/92c6efb05ffdfe16fbdc9e463229d9af8c3b74dc943ed4b4857a87b223c2/fairseq-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (1.7MB)\n", "\u001b[K |████████████████████████████████| 1.7MB 5.7MB/s \n", "\u001b[?25hCollecting dataclasses\n", " Downloading https://files.pythonhosted.org/packages/26/2f/1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d/dataclasses-0.6-py3-none-any.whl\n", "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq) (0.29.22)\n", "Collecting hydra-core\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", "\u001b[K |████████████████████████████████| 133kB 14.5MB/s \n", "\u001b[?25hRequirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq) (1.14.5)\n", "Collecting sacrebleu>=1.4.12\n", 
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", "\u001b[K |████████████████████████████████| 61kB 6.3MB/s \n", "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq) (4.41.1)\n", "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq) (1.8.0+cu101)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq) (2019.12.20)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from fairseq) (1.19.5)\n", "Collecting omegaconf<2.1,>=2.0.5\n", " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", "Collecting antlr4-python3-runtime==4.8\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", "\u001b[K |████████████████████████████████| 112kB 18.4MB/s \n", "\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core->fairseq) (5.1.2)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq) (2.20)\n", "Collecting portalocker==2.0.0\n", " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch->fairseq) (3.7.4.3)\n", "Collecting PyYAML>=5.1.*\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl 
(636kB)\n", "\u001b[K |████████████████████████████████| 645kB 18.0MB/s \n", "\u001b[?25hRequirement already satisfied: zipp>=0.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core->fairseq) (3.4.1)\n", "Building wheels for collected packages: antlr4-python3-runtime\n", " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=7443fbcc47b93d3b320b897cf91d8b947b6fdc6a0795dcce01ed16fd31c8ab6d\n", " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", "Successfully built antlr4-python3-runtime\n", "Installing collected packages: dataclasses, PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, portalocker, sacrebleu, fairseq\n", " Found existing installation: PyYAML 3.13\n", " Uninstalling PyYAML-3.13:\n", " Successfully uninstalled PyYAML-3.13\n", "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 dataclasses-0.6 fairseq-0.10.2 hydra-core-1.0.6 omegaconf-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "wNtI0ghs7abS", "outputId": "af41d635-20bd-4508-c04a-1e9b778c2efb", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!pip install sacremoses sentencepiece" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (0.0.43)\n", "Collecting sentencepiece\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)\n", "\u001b[K |████████████████████████████████| 1.2MB 4.3MB/s \n", "\u001b[?25hRequirement already satisfied: tqdm in 
/usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", "Installing collected packages: sentencepiece\n", "Successfully installed sentencepiece-0.1.95\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "yDqmlASopIgT", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e405c95a-b5ad-4950-80ab-70dada679125" }, "source": [ "!pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", " Using cached https://github.com/PyThaiNLP/pythainlp/archive/dev.zip\n", "Requirement already satisfied (use --upgrade to upgrade): pythainlp==2.3.0.dev0 from https://github.com/PyThaiNLP/pythainlp/archive/dev.zip in /usr/local/lib/python3.7/dist-packages\n", "Requirement already satisfied: python-crfsuite>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from pythainlp==2.3.0.dev0) (0.9.7)\n", "Requirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.7/dist-packages (from pythainlp==2.3.0.dev0) (2.23.0)\n", "Requirement already satisfied: tinydb>=3.0 in /usr/local/lib/python3.7/dist-packages (from pythainlp==2.3.0.dev0) (4.4.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->pythainlp==2.3.0.dev0) (2020.12.5)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from 
requests>=2.22.0->pythainlp==2.3.0.dev0) (1.24.3)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->pythainlp==2.3.0.dev0) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.22.0->pythainlp==2.3.0.dev0) (2.10)\n", "Building wheels for collected packages: pythainlp\n", " Building wheel for pythainlp (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for pythainlp: filename=pythainlp-2.3.0.dev0-cp37-none-any.whl size=11003566 sha256=b64ebc4010c51f2644c15473edd0c49540644725a367c28baa0d3f3e19edcccb\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-zkojv2_o/wheels/79/4e/1e/26f3198c6712ecfbee92928ed1dde923a078da3d222401cc78\n", "Successfully built pythainlp\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "BKFwp-KCpazZ" }, "source": [ "**Download and install all models**" ] }, { "cell_type": "code", "metadata": { "id": "SUEFsm8PpRk4" }, "source": [ "from pythainlp.translate import download_model_all" ], "execution_count": 11, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "e6KZgZKwpmDa", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "244fc402-680b-4773-a1fb-89470f795d91" }, "source": [ "download_model_all()" ], "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ "Corpus: scb_1m_en-th_moses\n", "- Downloading: scb_1m_en-th_moses 1.0\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "100%|██████████| 1174648148/1174648148 [00:14<00:00, 81506882.14it/s]\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "Corpus: scb_1m_th-en_spm\n", "- Downloading: scb_1m_th-en_spm 1.0\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "100%|██████████| 703780432/703780432 [00:08<00:00, 78234386.81it/s]\n" ], "name": "stderr" } ] }, { "cell_type": "markdown", "metadata": { "id": "d8-XXPCmqofY" }, "source": [ "## Translate" ] }, 
{ "cell_type": "markdown", "metadata": { "id": "QkC0f0elqtU1" }, "source": [ "### Import" ] }, { "cell_type": "code", "metadata": { "id": "mNg0wxkEqszb" }, "source": [ "from pythainlp.translate import EnThTranslator, ThEnTranslator" ], "execution_count": 1, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "YFOIgUE2rdri" }, "source": [ "`EnThTranslator/ThEnTranslator.translate(text)`\n", "\n", "- `text` : the text to translate\n", "\n", "#### Supported languages\n", "- `th` is the Thai language\n", "- `en` is the English language" ] }, { "cell_type": "markdown", "metadata": { "id": "pOO38Tvupnsr" }, "source": [ "### English to Thai" ] }, { "cell_type": "markdown", "metadata": { "id": "01VmL_Tspr4F" }, "source": [ "We have 1 model\n", "\n", "- `scb_1m_en-th_moses` - `bpe` tokenizer" ] }, { "cell_type": "code", "metadata": { "id": "UEDoyGYkqiGl", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "438ec030-d23e-4ec4-eb19-c6ddd8e794cb" }, "source": [ "print(EnThTranslator().translate(\"I want fried chicken.\"))" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "text": [ "ไก่ทอดค่ะ\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "9MKFtxeFq1BS" }, "source": [ "### Thai to English\n", "\n", "We have 1 model\n", "\n", "- `scb_1m_th-en_spm` - SentencePiece (`spm`) tokenizer" ] }, { "cell_type": "code", "metadata": { "id": "zS7SnFfaq6BS", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e69fd764-1a5c-48e0-8386-f58574b8cf56" }, "source": [ "print(ThEnTranslator().translate(\"ผมอยากกินไก่ทอด\"))" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "text": [ "I want fried chicken.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "yPKqpk0XrRMI", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f44a2faf-b328-4d77-d421-0b0f3e6ca051" }, "source": [ "print(ThEnTranslator().translate(\"ผมอยากเขียนโปรแกรมคอมพิวเตอร์\"))" ], "execution_count": 7, "outputs": [ { "output_type": "stream", 
"text": [ "I want to write a computer program.\n" ], "name": "stdout" } ] } ] }