{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "01-Token.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    }
  },
  "cells": [
    {
      "metadata": { "id": "Lg8oHBAuVnhM", "colab_type": "text" },
      "cell_type": "markdown",
      "source": [
        "# **Chapter 1 자연어 다루기**\n",
        "## **Token:** 어휘 추출하기"
      ]
    },
    {
      "metadata": { "colab_type": "code" },
      "cell_type": "code",
      "source": [
        "# Environment setup (Colab). konlpy drives Java-based analyzers, so a JDK is\n",
        "# installed next to the Python packages.\n",
        "# -y keeps apt-get non-interactive; -qq keeps the install log out of the notebook.\n",
        "!apt-get update -qq\n",
        "!apt-get install -y -qq g++ openjdk-8-jdk\n",
        "# %pip installs into the running kernel's environment. The pins are the versions\n",
        "# reported by the original run (Python 3.6 Colab runtime); revisit them when\n",
        "# moving to a newer runtime.\n",
        "%pip install -q nltk==3.2.5 konlpy==0.5.1"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "metadata": {
        "id": "zfiQShZIVnhO",
        "colab_type": "code",
        "outputId": "7191f933-39e9-4360-8777-35f1b4be6689"
      },
      "cell_type": "code",
      "source": [
        "import re\n",
        "\n",
        "import nltk\n",
        "import nltk.help as nltk_help\n",
        "from nltk import FreqDist, pos_tag, sent_tokenize, word_tokenize\n",
        "from nltk.tokenize import TreebankWordTokenizer\n",
        "\n",
        "from konlpy.tag import Hannanum, Kkma, Okt\n",
        "\n",
        "# NLTK resources used below: sentence/word tokenizer model, tag-set help text,\n",
        "# and the English POS-tagger model.\n",
        "nltk.download('punkt')\n",
        "nltk.download('tagsets')\n",
        "nltk.download('averaged_perceptron_tagger')\n",
        "\n",
        "# Sample inputs shared by every section of the notebook.\n",
        "text_eng = \" Don't hesitate to ask questions\"\n",
        "text_kor = \"\"\"삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n",
        "홍보:유관순 031-478-2311 010-8888-9999.\n",
        "삼성 페이지 https://www.samsung.com/sec/index.html\"\"\""
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data] Unzipping tokenizers/punkt.zip.\n",
            "[nltk_data] Downloading package tagsets to /root/nltk_data...\n",
            "[nltk_data] Unzipping help/tagsets.zip.\n",
            "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
            "[nltk_data] /root/nltk_data...\n",
            "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "FqoBRqRhVnhR",
        "colab_type": "code",
        "outputId": "2926b985-e862-4a60-b1f9-f01d3390a720",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "text_kor"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \\n홍보:유관순 031-478-2311 010-8888-9999.\\n삼성 페이지 https://www.samsung.com/sec/index.html'"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 2
        }
      ]
    },
    {
      "metadata": {
        "id": "QOz0v8EFVnha",
        "colab_type": "code",
        "outputId": "3efe2748-069f-4dda-9544-9fd85cb0a2b5",
        "colab": { "base_uri": "https://localhost:8080/", "height": 72 }
      },
      "cell_type": "code",
      "source": [
        "sent_tokenize(text_kor)"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.',\n",
              " '홍보:유관순 031-478-2311 010-8888-9999.',\n",
              " '삼성 페이지 https://www.samsung.com/sec/index.html']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 3
        }
      ]
    },
    {
      "metadata": {
        "id": "fXYSF-mFVnhe",
        "colab_type": "code",
        "outputId": "ae4d23fd-1c4f-402a-8869-fe5c76aaa667",
        "colab": { "base_uri": "https://localhost:8080/", "height": 399 }
      },
      "cell_type": "code",
      "source": [
        "tokens_kor = word_tokenize(text_kor)\n",
        "tokens_kor"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['삼성',\n",
              " '갤럭시',\n",
              " '(',\n",
              " 'GalaxyNote',\n",
              " ')',\n",
              " '노트의',\n",
              " '신형을',\n",
              " '홍보',\n",
              " '합니다',\n",
              " '.',\n",
              " '홍보',\n",
              " ':',\n",
              " '유관순',\n",
              " '031-478-2311',\n",
              " '010-8888-9999',\n",
              " '.',\n",
              " '삼성',\n",
              " '페이지',\n",
              " 'https',\n",
              " ':',\n",
              " '//www.samsung.com/sec/index.html']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 4
        }
      ]
    },
    {
      "metadata": {
        "scrolled": true,
        "id": "o5YaSQ56Vnhk",
        "colab_type": "code",
        "outputId": "97da4ed0-915c-4de2-d7fe-1bc364994fe4",
        "colab": { "base_uri": "https://localhost:8080/", "height": 326 }
      },
      "cell_type": "code",
      "source": [
        "dict(FreqDist(tokens_kor))"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'(': 1,\n",
              " ')': 1,\n",
              " '.': 2,\n",
              " '//www.samsung.com/sec/index.html': 1,\n",
              " '010-8888-9999': 1,\n",
              " '031-478-2311': 1,\n",
              " ':': 2,\n",
              " 'GalaxyNote': 1,\n",
              " 'https': 1,\n",
              " '갤럭시': 1,\n",
              " '노트의': 1,\n",
              " '삼성': 2,\n",
              " '신형을': 1,\n",
              " '유관순': 1,\n",
              " '페이지': 1,\n",
              " '합니다': 1,\n",
              " '홍보': 2}"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 5
        }
      ]
    },
    {
      "metadata": { "id": "VHMiLgPjVnhp", "colab_type": "text" },
      "cell_type": "markdown",
      "source": [
        "## **Re 를 사용한 Regex 정규식**\n",
        "https://news.v.daum.net/v/20190223110230553"
      ]
    },
    {
      "metadata": {
        "id": "SY47J_R9Vnhr",
        "colab_type": "code",
        "outputId": "b7a14d4a-0ab9-4a70-afd6-63e91f6e66ee",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "text_kor"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \\n홍보:유관순 031-478-2311 010-8888-9999.\\n삼성 페이지 https://www.samsung.com/sec/index.html'"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 6
        }
      ]
    },
    {
      "metadata": {
        "id": "nlesrE4DVnhx",
        "colab_type": "code",
        "outputId": "b58fd283-1bf2-440a-abd2-45c2a32cd4ca",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Runs of precomposed Hangul syllables (U+AC00 through U+D7A3).\n",
        "hangul_pattern = re.compile(r'[가-힣]+')\n",
        "hangul_pattern.findall(text_kor)"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['삼성', '갤럭시', '노트의', '신형을', '홍보', '합니다', '홍보', '유관순', '삼성', '페이지']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 7
        }
      ]
    },
    {
      "metadata": {
        "id": "wQopQUCYVnh2",
        "colab_type": "code",
        "outputId": "a40b4021-baa0-48fe-959b-f9f3fe280689",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Phone numbers: 3 digits, 3-4 digits, 4 digits, separated by hyphens.\n",
        "phone_pattern = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}')\n",
        "phone_pattern.findall(text_kor)"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['031-478-2311', '010-8888-9999']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 8
        }
      ]
    },
    {
      "metadata": {
        "id": "7adeQXBpVnh8",
        "colab_type": "code",
        "outputId": "f3e3820d-a6d0-4f70-dbae-2bce9a7e9d1a",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Same result on this text using the \\d shorthand. For str patterns \\d also\n",
        "# matches non-ASCII decimal digits, so it is slightly broader than [0-9].\n",
        "phone_pattern_short = re.compile(r'\\d{3}-\\d{3,4}-\\d{4}')\n",
        "phone_pattern_short.findall(text_kor)"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['031-478-2311', '010-8888-9999']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 9
        }
      ]
    },
    {
      "metadata": {
        "id": "YpvLQE9tVniC",
        "colab_type": "code",
        "outputId": "dd84922f-59c7-4b27-9840-eec2912a6486",
        "colab": { "base_uri": "https://localhost:8080/", "height": 145 }
      },
      "cell_type": "code",
      "source": [
        "# Runs of characters that are neither a space nor a Hangul syllable.\n",
        "non_hangul_pattern = re.compile(r'[^ 가-힣]+')\n",
        "non_hangul_pattern.findall(text_kor)"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['(GalaxyNote)',\n",
              " '.',\n",
              " '\\n',\n",
              " ':',\n",
              " '031-478-2311',\n",
              " '010-8888-9999.\\n',\n",
              " 'https://www.samsung.com/sec/index.html']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 10
        }
      ]
    },
    {
      "metadata": {
        "id": "QkwBGQ4oVniJ",
        "colab_type": "code",
        "outputId": "9892a5fa-0f2d-4a83-8b53-ffd061a57301",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Delete every non-Hangul run, keeping only Hangul words and the spaces between them.\n",
        "non_hangul_pattern.sub(\"\", text_kor)"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'삼성 갤럭시노트의 신형을 홍보 합니다 홍보유관순 삼성 페이지 '"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 11
        }
      ]
    },
    {
      "metadata": {
        "id": "6IRvqkYQVniP",
        "colab_type": "code",
        "outputId": "ad076ab2-c01c-40d5-96bd-24b55a738fa0",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# The dots are escaped so they match a literal '.', not any character, and the\n",
        "# letter class is spelled A-Za-z because the range A-z also spans the six\n",
        "# punctuation characters that sit between 'Z' and 'a'.\n",
        "url_pattern = re.compile(r'https://w{3}\\.[A-Za-z]+\\.[./A-Za-z]+')\n",
        "url_pattern.findall(text_kor)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['https://www.samsung.com/sec/index.html']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 12
        }
      ]
    },
    {
      "metadata": { "id": "w0nrJdw2VniY", "colab_type": "text" },
      "cell_type": "markdown",
      "source": [
        "# **Stemming / Tagging**\n",
        "> **nltk**"
      ]
    },
    {
      "metadata": {
        "id": "iYNU0tIzVnia",
        "colab_type": "code",
        "outputId": "bbd260b6-2424-4a20-a440-99379fbb4c16",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Keep the original sample untouched; the lower-cased copy gets its own name.\n",
        "text_eng_lower = text_eng.lower()\n",
        "text_eng_lower"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\" don't hesitate to ask questions\""
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 13
        }
      ]
    },
    {
      "metadata": {
        "id": "Fs1N6nOhVnig",
        "colab_type": "code",
        "outputId": "84ad1671-deaa-48f9-be1b-130730ee34d6",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "treebank_tokenizer = TreebankWordTokenizer()\n",
        "tokens_eng = treebank_tokenizer.tokenize(text_eng_lower)\n",
        "tokens_eng"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['do', \"n't\", 'hesitate', 'to', 'ask', 'questions']"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 14
        }
      ]
    },
    {
      "metadata": {
        "id": "lgQbnTnVVnim",
        "colab_type": "code",
        "outputId": "499c5621-bb1a-4e4e-dfa6-f21d4f6ae318",
        "colab": { "base_uri": "https://localhost:8080/", "height": 126 }
      },
      "cell_type": "code",
      "source": [
        "pos_tag(tokens_eng)"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[('do', 'VBP'),\n",
              " (\"n't\", 'RB'),\n",
              " ('hesitate', 'VB'),\n",
              " ('to', 'TO'),\n",
              " ('ask', 'VB'),\n",
              " ('questions', 'NNS')]"
            ]
          },
          "metadata": { "tags": [] },
          "execution_count": 15
        }
      ]
    },
    {
      "metadata": {
        "id": "UCKTNS1fVnit",
        "colab_type": "code",
        "outputId": "7f546e31-98f8-4af8-9cf0-ffe84c7e1add",
        "colab": { "base_uri": "https://localhost:8080/", "height": 72 }
      },
      "cell_type": "code",
      "source": [
        "nltk_help.upenn_tagset('PRP')  # personal pronoun"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "PRP: pronoun, personal\n",
            " hers herself him himself hisself it itself me myself one oneself ours\n",
            " ourselves ownself self she thee theirs them themselves they thou thy us\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "uECK_BsKVni0",
        "colab_type": "code",
        "outputId": "15e81504-f720-49ce-a072-5fa7aa3dc646",
        "colab": { "base_uri": "https://localhost:8080/", "height": 90 }
      },
      "cell_type": "code",
      "source": [
        "nltk_help.upenn_tagset('JJ')  # adjective"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "JJ: adjective or numeral, ordinal\n",
            " third ill-mannered pre-war regrettable oiled calamitous first separable\n",
            " ectoplasmic battery-powered participatory fourth still-to-be-named\n",
            " multilingual multi-disciplinary ...\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": { "id": "F6ZWw4gTVni6", "colab_type": "text" },
      "cell_type": "markdown",
      "source": [
        "# **Stemming / Tagging (한글)**\n",
        "> **konlpy**"
      ]
    },
    {
      "metadata": {
        "id": "k9411pX3Vni9",
        "colab_type": "code",
        "outputId": "339d4da9-fea1-44e2-eec1-3cad8a252843",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "okt = Okt()\n",
        "\n",
        "text = \"파이썬을 활용하여 자연어 분석 특강입니다\"\n",
        "# Stemming: stem is a boolean flag, so pass True rather than the string \"true\".\n",
        "# Compare with the next cell to see which tokens change when stemming is on.\n",
        "print(okt.pos(text, stem=True))"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하다', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('이다', 'Adjective')]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "XauxL7EJVnjE",
        "colab_type": "code",
        "outputId": "983990ff-465d-42d1-f49d-b83d3e4406e0",
        "colab": { "base_uri": "https://localhost:8080/", "height": 35 }
      },
      "cell_type": "code",
      "source": [
        "# Same sentence without stemming, for comparison.\n",
        "print(okt.pos(text))"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하여', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('입니다', 'Adjective')]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "rIHSsn-0VnjL",
        "colab_type": "code",
        "outputId": "97a5044f-2d8b-47c5-9c74-ccc42bfb349b",
        "colab": { "base_uri": "https://localhost:8080/", "height": 72 }
      },
      "cell_type": "code",
      "source": [
        "%%time\n",
        "kkma = Kkma()\n",
        "print(kkma.pos(text))"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('파이', 'NNG'), ('썰', 'VV'), ('ㄴ', 'ETD'), ('을', 'NNG'), ('활용', 'NNG'), ('하', 'XSV'), ('여', 'ECS'), ('자연어', 'NNG'), ('분석', 'NNG'), ('특강', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EFN')]\n",
            "CPU times: user 17.1 s, sys: 584 ms, total: 17.7 s\n",
            "Wall time: 9.41 s\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "PoNY-MvEVnjS",
        "colab_type": "code",
        "outputId": "e37bb3d8-01c6-4171-f51b-13019943a3d8",
        "colab": { "base_uri": "https://localhost:8080/", "height": 72 }
      },
      "cell_type": "code",
      "source": [
        "%%time\n",
        "hannanum = Hannanum()\n",
        "print(hannanum.pos(text))"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('파이썬', 'N'), ('을', 'J'), ('활용', 'N'), ('하', 'X'), ('어', 'E'), ('자연어', 'N'), ('분석', 'N'), ('특강', 'N'), ('이', 'J'), ('ㅂ니다', 'E')]\n",
            "CPU times: user 4.28 s, sys: 69.4 ms, total: 4.35 s\n",
            "Wall time: 2.2 s\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}