{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "01-Token.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"cells": [
{
"metadata": {
"id": "Lg8oHBAuVnhM",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# **Chapter 1 자연어 다루기**\n",
"## **Token:** 어휘 추출하기"
]
},
{
"metadata": {
"id": "zfiQShZIVnhO",
"colab_type": "code",
"outputId": "7191f933-39e9-4360-8777-35f1b4be6689",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 2037
}
},
"cell_type": "code",
"source": [
"# KoNLPy pulls in JPype1 (built from source in the recorded run) and its taggers run on Java,\n",
"# so a C++ compiler and OpenJDK 8 are installed before the Python packages.\n",
"# -y answers apt's confirmation prompt so the cell cannot stall waiting for input.\n",
"! apt-get update\n",
"! apt-get install -y g++ openjdk-8-jdk\n",
"! pip3 install nltk konlpy\n",
"\n",
"import nltk\n",
"# NLTK resources used later: tokenizer models, tag-set descriptions, and the POS tagger.\n",
"nltk.download('punkt')\n",
"nltk.download('tagsets')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"\n",
"# Sample English and Korean texts reused by the cells below.\n",
"text_eng = \" Don't hesitate to ask questions\"\n",
"text_kor = \"\"\"삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n",
"홍보:유관순 031-478-2311 010-8888-9999.\n",
"삼성 페이지 https://www.samsung.com/sec/index.html\"\"\""
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"\r0% [Working]\r \rIgn:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n",
"\r0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u\r \rGet:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,609 B]\n",
"\r0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u\r0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Connecting to security.u\r0% [2 InRelease gpgv 3,609 B] [Connecting to archive.ubuntu.com (91.189.88.152)\r \rIgn:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n",
"\r0% [2 InRelease gpgv 3,609 B] [Connecting to archive.ubuntu.com (91.189.88.152)\r \rHit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release\n",
"\r0% [2 InRelease gpgv 3,609 B] [Connecting to archive.ubuntu.com (91.189.88.152)\r \rGet:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release [564 B]\n",
"Get:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release.gpg [801 B]\n",
"Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n",
"Hit:8 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n",
"Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease\n",
"Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n",
"Hit:11 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease\n",
"Get:13 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Packages [6,819 B]\n",
"Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n",
"Get:15 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [363 kB]\n",
"Get:16 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [956 kB]\n",
"Get:17 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [161 kB]\n",
"Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [727 kB]\n",
"Fetched 2,470 kB in 3s (850 kB/s)\n",
"Reading package lists... Done\n",
"Reading package lists... Done\n",
"Building dependency tree \n",
"Reading state information... Done\n",
"g++ is already the newest version (4:7.3.0-3ubuntu2.1).\n",
"g++ set to manually installed.\n",
"The following package was automatically installed and is no longer required:\n",
" libnvidia-common-410\n",
"Use 'apt autoremove' to remove it.\n",
"The following additional packages will be installed:\n",
" fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java\n",
" libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jre x11-utils\n",
"Suggested packages:\n",
" openjdk-8-demo openjdk-8-source visualvm icedtea-8-plugin mesa-utils\n",
"The following NEW packages will be installed:\n",
" fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java\n",
" libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk openjdk-8-jre x11-utils\n",
"0 upgraded, 8 newly installed, 0 to remove and 6 not upgraded.\n",
"Need to get 4,771 kB of archives.\n",
"After this operation, 13.1 MB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-core all 2.37-1 [1,041 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-extra all 2.37-1 [1,953 kB]\n",
"Get:4 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]\n",
"Get:5 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java all 0.33.3-20ubuntu0.1 [34.7 kB]\n",
"Get:6 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java-jni amd64 0.33.3-20ubuntu0.1 [28.3 kB]\n",
"Get:7 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre amd64 8u191-b12-2ubuntu0.18.04.1 [69.7 kB]\n",
"Get:8 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jdk amd64 8u191-b12-2ubuntu0.18.04.1 [1,435 kB]\n",
"Fetched 4,771 kB in 2s (3,041 kB/s)\n",
"Selecting previously unselected package libxxf86dga1:amd64.\n",
"(Reading database ... 131294 files and directories currently installed.)\n",
"Preparing to unpack .../0-libxxf86dga1_2%3a1.1.4-1_amd64.deb ...\n",
"Unpacking libxxf86dga1:amd64 (2:1.1.4-1) ...\n",
"Selecting previously unselected package fonts-dejavu-core.\n",
"Preparing to unpack .../1-fonts-dejavu-core_2.37-1_all.deb ...\n",
"Unpacking fonts-dejavu-core (2.37-1) ...\n",
"Selecting previously unselected package fonts-dejavu-extra.\n",
"Preparing to unpack .../2-fonts-dejavu-extra_2.37-1_all.deb ...\n",
"Unpacking fonts-dejavu-extra (2.37-1) ...\n",
"Selecting previously unselected package x11-utils.\n",
"Preparing to unpack .../3-x11-utils_7.7+3build1_amd64.deb ...\n",
"Unpacking x11-utils (7.7+3build1) ...\n",
"Selecting previously unselected package libatk-wrapper-java.\n",
"Preparing to unpack .../4-libatk-wrapper-java_0.33.3-20ubuntu0.1_all.deb ...\n",
"Unpacking libatk-wrapper-java (0.33.3-20ubuntu0.1) ...\n",
"Selecting previously unselected package libatk-wrapper-java-jni:amd64.\n",
"Preparing to unpack .../5-libatk-wrapper-java-jni_0.33.3-20ubuntu0.1_amd64.deb ...\n",
"Unpacking libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ...\n",
"Selecting previously unselected package openjdk-8-jre:amd64.\n",
"Preparing to unpack .../6-openjdk-8-jre_8u191-b12-2ubuntu0.18.04.1_amd64.deb ...\n",
"Unpacking openjdk-8-jre:amd64 (8u191-b12-2ubuntu0.18.04.1) ...\n",
"Selecting previously unselected package openjdk-8-jdk:amd64.\n",
"Preparing to unpack .../7-openjdk-8-jdk_8u191-b12-2ubuntu0.18.04.1_amd64.deb ...\n",
"Unpacking openjdk-8-jdk:amd64 (8u191-b12-2ubuntu0.18.04.1) ...\n",
"Processing triggers for mime-support (3.60ubuntu1) ...\n",
"Setting up fonts-dejavu-core (2.37-1) ...\n",
"Setting up libxxf86dga1:amd64 (2:1.1.4-1) ...\n",
"Processing triggers for libc-bin (2.27-3ubuntu1) ...\n",
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
"Setting up fonts-dejavu-extra (2.37-1) ...\n",
"Processing triggers for hicolor-icon-theme (0.17-2) ...\n",
"Processing triggers for fontconfig (2.12.6-0ubuntu2) ...\n",
"Setting up x11-utils (7.7+3build1) ...\n",
"Setting up libatk-wrapper-java (0.33.3-20ubuntu0.1) ...\n",
"Setting up libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ...\n",
"Setting up openjdk-8-jre:amd64 (8u191-b12-2ubuntu0.18.04.1) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/policytool to provide /usr/bin/policytool (policytool) in auto mode\n",
"Setting up openjdk-8-jdk:amd64 (8u191-b12-2ubuntu0.18.04.1) ...\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/appletviewer to provide /usr/bin/appletviewer (appletviewer) in auto mode\n",
"update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jconsole to provide /usr/bin/jconsole (jconsole) in auto mode\n",
"Processing triggers for libc-bin (2.27-3ubuntu1) ...\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)\n",
"Collecting konlpy\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/3d/4e983cd98d87b50b2ab0387d73fa946f745aa8164e8888a714d5129f9765/konlpy-0.5.1-py2.py3-none-any.whl (19.4MB)\n",
"\u001b[K 100% |████████████████████████████████| 19.4MB 1.9MB/s \n",
"\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0)\n",
"Collecting JPype1>=0.5.7 (from konlpy)\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/c4/4b/60a3e63d51714d4d7ef1b1efdf84315d118a0a80a5b085bb52a7e2428cdc/JPype1-0.6.3.tar.gz (168kB)\n",
"\u001b[K 100% |████████████████████████████████| 174kB 29.0MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: JPype1\n",
" Building wheel for JPype1 (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Stored in directory: /root/.cache/pip/wheels/0e/2b/e8/c0b818ac4b3d35104d35e48cdc7afe27fc06ea277feed2831a\n",
"Successfully built JPype1\n",
"Installing collected packages: JPype1, konlpy\n",
"Successfully installed JPype1-0.6.3 konlpy-0.5.1\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n",
"[nltk_data] Downloading package tagsets to /root/nltk_data...\n",
"[nltk_data] Unzipping help/tagsets.zip.\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /root/nltk_data...\n",
"[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "FqoBRqRhVnhR",
"colab_type": "code",
"outputId": "2926b985-e862-4a60-b1f9-f01d3390a720",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"text_kor"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \\n홍보:유관순 031-478-2311 010-8888-9999.\\n삼성 페이지 https://www.samsung.com/sec/index.html'"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"metadata": {
"id": "QOz0v8EFVnha",
"colab_type": "code",
"outputId": "3efe2748-069f-4dda-9544-9fd85cb0a2b5",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"cell_type": "code",
"source": [
"from nltk import sent_tokenize, word_tokenize, FreqDist\n",
"sent_tokenize(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.',\n",
" '홍보:유관순 031-478-2311 010-8888-9999.',\n",
" '삼성 페이지 https://www.samsung.com/sec/index.html']"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"metadata": {
"id": "fXYSF-mFVnhe",
"colab_type": "code",
"outputId": "ae4d23fd-1c4f-402a-8869-fe5c76aaa667",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 399
}
},
"cell_type": "code",
"source": [
"tokens = word_tokenize(text_kor)\n",
"tokens"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['삼성',\n",
" '갤럭시',\n",
" '(',\n",
" 'GalaxyNote',\n",
" ')',\n",
" '노트의',\n",
" '신형을',\n",
" '홍보',\n",
" '합니다',\n",
" '.',\n",
" '홍보',\n",
" ':',\n",
" '유관순',\n",
" '031-478-2311',\n",
" '010-8888-9999',\n",
" '.',\n",
" '삼성',\n",
" '페이지',\n",
" 'https',\n",
" ':',\n",
" '//www.samsung.com/sec/index.html']"
]
},
"metadata": {
"tags": []
},
"execution_count": 4
}
]
},
{
"metadata": {
"scrolled": true,
"id": "o5YaSQ56Vnhk",
"colab_type": "code",
"outputId": "97da4ed0-915c-4de2-d7fe-1bc364994fe4",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 326
}
},
"cell_type": "code",
"source": [
"dict(FreqDist(tokens))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'(': 1,\n",
" ')': 1,\n",
" '.': 2,\n",
" '//www.samsung.com/sec/index.html': 1,\n",
" '010-8888-9999': 1,\n",
" '031-478-2311': 1,\n",
" ':': 2,\n",
" 'GalaxyNote': 1,\n",
" 'https': 1,\n",
" '갤럭시': 1,\n",
" '노트의': 1,\n",
" '삼성': 2,\n",
" '신형을': 1,\n",
" '유관순': 1,\n",
" '페이지': 1,\n",
" '합니다': 1,\n",
" '홍보': 2}"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"metadata": {
"id": "VHMiLgPjVnhp",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"\n",
"## **Re 를 사용한 Regex 정규식**\n",
"https://news.v.daum.net/v/20190223110230553"
]
},
{
"metadata": {
"id": "SY47J_R9Vnhr",
"colab_type": "code",
"outputId": "b7a14d4a-0ab9-4a70-afd6-63e91f6e66ee",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"text_kor"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \\n홍보:유관순 031-478-2311 010-8888-9999.\\n삼성 페이지 https://www.samsung.com/sec/index.html'"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"metadata": {
"id": "nlesrE4DVnhx",
"colab_type": "code",
"outputId": "b58fd283-1bf2-440a-abd2-45c2a32cd4ca",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"import re\n",
"tokenizer = re.compile(r'[가-힣]+')\n",
"tokenizer.findall(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['삼성', '갤럭시', '노트의', '신형을', '홍보', '합니다', '홍보', '유관순', '삼성', '페이지']"
]
},
"metadata": {
"tags": []
},
"execution_count": 7
}
]
},
{
"metadata": {
"id": "wQopQUCYVnh2",
"colab_type": "code",
"outputId": "a40b4021-baa0-48fe-959b-f9f3fe280689",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"tokenizer = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}')\n",
"tokenizer.findall(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['031-478-2311', '010-8888-9999']"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"metadata": {
"id": "7adeQXBpVnh8",
"colab_type": "code",
"outputId": "f3e3820d-a6d0-4f70-dbae-2bce9a7e9d1a",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"tokenizer = re.compile(r'\\d{3}-\\d{3,4}-\\d{4}')\n",
"tokenizer.findall(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['031-478-2311', '010-8888-9999']"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"metadata": {
"id": "YpvLQE9tVniC",
"colab_type": "code",
"outputId": "dd84922f-59c7-4b27-9840-eec2912a6486",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 145
}
},
"cell_type": "code",
"source": [
"tokenizer = re.compile(r'[^ 가-힣]+')\n",
"tokenizer.findall(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['(GalaxyNote)',\n",
" '.',\n",
" '\\n',\n",
" ':',\n",
" '031-478-2311',\n",
" '010-8888-9999.\\n',\n",
" 'https://www.samsung.com/sec/index.html']"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"metadata": {
"id": "QkwBGQ4oVniJ",
"colab_type": "code",
"outputId": "9892a5fa-0f2d-4a83-8b53-ffd061a57301",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"tokenizer.sub(\"\", text_kor)#.split(\" \")"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'삼성 갤럭시노트의 신형을 홍보 합니다 홍보유관순 삼성 페이지 '"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"metadata": {
"id": "6IRvqkYQVniP",
"colab_type": "code",
"outputId": "ad076ab2-c01c-40d5-96bd-24b55a738fa0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"# Match an https URL of the form https://www.<name>.<rest>.\n",
"# The dots are escaped so they match a literal '.', and A-Za-z is used because the\n",
"# range A-z also covers the punctuation characters that sit between 'Z' and 'a' in ASCII.\n",
"tokenizer = re.compile(r'https://w{3}\\.[A-Za-z]+\\.[./A-Za-z]+')\n",
"tokenizer.findall(text_kor)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['https://www.samsung.com/sec/index.html']"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"metadata": {
"id": "w0nrJdw2VniY",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"\n",
"# **Stemming / Tagging**\n",
"> **nltk**"
]
},
{
"metadata": {
"id": "iYNU0tIzVnia",
"colab_type": "code",
"outputId": "bbd260b6-2424-4a20-a440-99379fbb4c16",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"text_eng = text_eng.lower()\n",
"text_eng"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"\" don't hesitate to ask questions\""
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"metadata": {
"id": "Fs1N6nOhVnig",
"colab_type": "code",
"outputId": "84ad1671-deaa-48f9-be1b-130730ee34d6",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"from nltk.tokenize import TreebankWordTokenizer\n",
"tokenizer = TreebankWordTokenizer()\n",
"token = tokenizer.tokenize(text_eng)\n",
"token"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['do', \"n't\", 'hesitate', 'to', 'ask', 'questions']"
]
},
"metadata": {
"tags": []
},
"execution_count": 14
}
]
},
{
"metadata": {
"id": "lgQbnTnVVnim",
"colab_type": "code",
"outputId": "499c5621-bb1a-4e4e-dfa6-f21d4f6ae318",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 126
}
},
"cell_type": "code",
"source": [
"from nltk import pos_tag\n",
"pos_tag(token)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('do', 'VBP'),\n",
" (\"n't\", 'RB'),\n",
" ('hesitate', 'VB'),\n",
" ('to', 'TO'),\n",
" ('ask', 'VB'),\n",
" ('questions', 'NNS')]"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"metadata": {
"id": "UCKTNS1fVnit",
"colab_type": "code",
"outputId": "7f546e31-98f8-4af8-9cf0-ffe84c7e1add",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"cell_type": "code",
"source": [
"import nltk.help as nltk_help\n",
"nltk_help.upenn_tagset('PRP') # 대명사"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"PRP: pronoun, personal\n",
" hers herself him himself hisself it itself me myself one oneself ours\n",
" ourselves ownself self she thee theirs them themselves they thou thy us\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "uECK_BsKVni0",
"colab_type": "code",
"outputId": "15e81504-f720-49ce-a072-5fa7aa3dc646",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 90
}
},
"cell_type": "code",
"source": [
"nltk_help.upenn_tagset('JJ') # 형용사"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"JJ: adjective or numeral, ordinal\n",
" third ill-mannered pre-war regrettable oiled calamitous first separable\n",
" ectoplasmic battery-powered participatory fourth still-to-be-named\n",
" multilingual multi-disciplinary ...\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "F6ZWw4gTVni6",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"\n",
"# **Stemming / Tagging (한글)**\n",
"> **konlpy**"
]
},
{
"metadata": {
"id": "k9411pX3Vni9",
"colab_type": "code",
"outputId": "339d4da9-fea1-44e2-eec1-3cad8a252843",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"from konlpy.tag import Okt\n",
"twitter = Okt()  # the name `twitter` is kept because the following cell calls twitter.pos(text)\n",
"\n",
"# Stemming: with stem=True, inflected forms are reduced to their dictionary form\n",
"# (compare with the un-stemmed output of the next cell).\n",
"# The flag is passed as the boolean True rather than the string \"true\".\n",
"text = \"파이썬을 활용하여 자연어 분석 특강입니다\"\n",
"print(twitter.pos(text, stem=True))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하다', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('이다', 'Adjective')]\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "XauxL7EJVnjE",
"colab_type": "code",
"outputId": "983990ff-465d-42d1-f49d-b83d3e4406e0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"cell_type": "code",
"source": [
"print(twitter.pos(text))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하여', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('입니다', 'Adjective')]\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "rIHSsn-0VnjL",
"colab_type": "code",
"outputId": "97a5044f-2d8b-47c5-9c74-ccc42bfb349b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"cell_type": "code",
"source": [
"%%time\n",
"from konlpy.tag import Kkma\n",
"kkma = Kkma()\n",
"print(kkma.pos(text))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"[('파이', 'NNG'), ('썰', 'VV'), ('ㄴ', 'ETD'), ('을', 'NNG'), ('활용', 'NNG'), ('하', 'XSV'), ('여', 'ECS'), ('자연어', 'NNG'), ('분석', 'NNG'), ('특강', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EFN')]\n",
"CPU times: user 17.1 s, sys: 584 ms, total: 17.7 s\n",
"Wall time: 9.41 s\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "PoNY-MvEVnjS",
"colab_type": "code",
"outputId": "e37bb3d8-01c6-4171-f51b-13019943a3d8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 72
}
},
"cell_type": "code",
"source": [
"%%time\n",
"from konlpy.tag import Hannanum\n",
"han = Hannanum()\n",
"print(han.pos(text))"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"[('파이썬', 'N'), ('을', 'J'), ('활용', 'N'), ('하', 'X'), ('어', 'E'), ('자연어', 'N'), ('분석', 'N'), ('특강', 'N'), ('이', 'J'), ('ㅂ니다', 'E')]\n",
"CPU times: user 4.28 s, sys: 69.4 ms, total: 4.35 s\n",
"Wall time: 2.2 s\n"
],
"name": "stdout"
}
]
}
]
}