{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "# **tf-idf**\n", "\n", "## **1 Document 자료를 불러오기**\n", "sklearn을 활용한 tf-idf 계산\n", "[**연간 기업결과 리포트**](https://news.samsung.com/global/samsung-electronics-announces-fourth-quarter-and-fy-2017-results)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'samsung electronics posted krw 65.98 trillion in consolidated revenue and krw 15.15 trillion in operating profit for the fourth quarter of 2017.\\n \\noverall, the company reported full-year revenue of krw 239.58 trillion and full-year operating profit of krw 53.65 trillion.\\n \\nfourth quarter earnings we'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Document 자료를 불러온다 : 2017년 연간결산 리포트\n", "with open('../data/News2017.txt', 'r', encoding='utf-8') as f:\n", " texts = f.read()\n", "\n", "texts = texts.lower()\n", "texts[:300]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'samsung electronics posted krw trillion in consolidated revenue and krw trillion in operating profit for the fourth quarter of overall the company reported full year revenue of krw trillion and full year operating profit of krw trillion fourth quarter earnings were driven by the components business '" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 영문 Token만 추출한다\n", "from nltk.tokenize import RegexpTokenizer\n", "re_capt = RegexpTokenizer(r'[a-z]\\w+')\n", "tokens = re_capt.tokenize(texts)\n", "document = \" \".join(tokens)\n", "document[:300]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "the 153\n", "and 100\n", "to 95\n", "in 77\n", "for 74\n", "of 53\n", "will 37\n", "as 35\n", "demand 33\n", "by 32\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 추출한 Token의 빈도를 계산한다\n", "from nltk import FreqDist\n", 
"import pandas as pd\n", "token_freq = FreqDist(tokens)\n", "token_freq = pd.Series(token_freq).sort_values(ascending=False)\n", "token_freq[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "## **2 sklearn을 활용한 tf-idf 계산**\n", "sklearn의 기본 데이터를 활용하여 tf-idf 결과값 출력" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# ! pip3 install scikit-learn" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# ! pip3 install scipy" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.01705916, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.01705916, 0.00852958, 0.03411832,\n", " 0.02558874, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n", " 0.03411832, 0.02558874, 0.03411832, 0.01705916, 0.00852958,\n", " 0.01705916, 0.0426479 , 0.03411832, 0.00852958, 0.00852958,\n", " 0.00852958, 0.05970706, 0.00852958, 0.0426479 , 0.01705916,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n", " 0.00852958, 0.03411832, 0.02558874, 0.00852958, 0.00852958,\n", " 0.03411832, 0.01705916, 0.00852958, 0.03411832, 0.01705916,\n", " 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.27294655,\n", " 0.0426479 , 0.01705916, 0.00852958, 0.00852958, 0.01705916,\n", " 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.00852958,\n", " 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.01705916, 0.02558874, 0.00852958, 0.00852958,\n", " 0.01705916, 0.15353243, 0.00852958, 0.00852958, 0.03411832,\n", " 0.05970706, 0.00852958, 0.01705916, 0.00852958, 0.00852958,\n", " 0.03411832, 0.00852958, 0.00852958, 0.0426479 , 0.02558874,\n", " 0.01705916, 0.03411832, 0.01705916, 0.00852958, 0.01705916,\n", " 0.00852958, 0.00852958, 0.0426479 , 0.01705916, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.0426479 , 0.01705916,\n", " 0.00852958, 0.02558874, 0.01705916, 0.02558874, 0.02558874,\n", " 0.00852958, 0.01705916, 0.05970706, 0.28147613, 0.00852958,\n", " 0.06823664, 0.03411832, 0.00852958, 0.01705916, 0.01705916,\n", " 0.01705916, 0.00852958, 0.00852958, 0.03411832, 
0.00852958,\n", " 0.06823664, 0.01705916, 0.00852958, 0.05970706, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.05117748, 0.00852958,\n", " 0.01705916, 0.02558874, 0.00852958, 0.00852958, 0.01705916,\n", " 0.18765075, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n", " 0.00852958, 0.02558874, 0.05117748, 0.00852958, 0.10235496,\n", " 0.00852958, 0.01705916, 0.01705916, 0.00852958, 0.01705916,\n", " 0.02558874, 0.00852958, 0.0852958 , 0.07676622, 0.03411832,\n", " 0.00852958, 0.14500285, 0.05117748, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.0852958 , 0.00852958,\n", " 0.03411832, 0.00852958, 0.00852958, 0.09382538, 0.02558874,\n", " 0.00852958, 0.02558874, 0.01705916, 0.00852958, 0.00852958,\n", " 0.00852958, 0.03411832, 0.05970706, 0.01705916, 0.01705916,\n", " 0.0426479 , 0.00852958, 0.00852958, 0.00852958, 0.02558874,\n", " 0.02558874, 0.02558874, 0.0426479 , 0.01705916, 0.10235496,\n", " 0.0426479 , 0.00852958, 0.13647327, 0.01705916, 0.00852958,\n", " 0.00852958, 0.02558874, 0.00852958, 0.00852958, 0.03411832,\n", " 0.00852958, 0.00852958, 0.05117748, 0.01705916, 0.00852958,\n", " 0.00852958, 0.07676622, 0.06823664, 0.07676622, 0.05970706,\n", " 0.00852958, 0.01705916, 0.02558874, 0.00852958, 0.00852958,\n", " 0.00852958, 0.03411832, 0.02558874, 0.00852958, 0.01705916,\n", " 0.01705916, 0.00852958, 0.13647327, 0.05970706, 0.00852958,\n", " 0.01705916, 0.01705916, 0.00852958, 0.03411832, 0.10235496,\n", " 0.00852958, 0.0426479 , 0.03411832, 0.05117748, 0.00852958,\n", " 0.00852958, 0.00852958, 0.0426479 , 0.00852958, 0.00852958,\n", " 0.05117748, 0.05970706, 0.02558874, 0.00852958, 0.02558874,\n", " 0.02558874, 0.02558874, 0.00852958, 0.01705916, 0.0426479 ,\n", " 0.01705916, 0.00852958, 0.00852958, 0.06823664, 0.00852958,\n", " 0.00852958, 0.01705916, 0.00852958, 0.14500285, 0.0426479 ,\n", " 0.02558874, 0.02558874, 0.00852958, 0.00852958, 0.00852958,\n", " 
0.01705916, 0.05970706, 0.00852958, 0.03411832, 0.00852958,\n", " 0.02558874, 0.00852958, 0.01705916, 0.16206201, 0.00852958,\n", " 0.0426479 , 0.00852958, 0.00852958, 0.00852958, 0.05970706,\n", " 0.01705916, 0.00852958, 0.01705916, 0.02558874, 0.01705916,\n", " 0.15353243, 0.01705916, 0.0426479 , 0.0426479 , 0.01705916,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.09382538,\n", " 0.00852958, 0.00852958, 0.01705916, 0.06823664, 0.00852958,\n", " 0.00852958, 0.00852958, 0.01705916, 0.01705916, 0.01705916,\n", " 0.00852958, 0.00852958, 0.02558874, 0.00852958, 0.00852958,\n", " 0.00852958, 0.07676622, 0.06823664, 0.00852958, 0.00852958,\n", " 0.01705916, 0.01705916, 0.01705916, 0.00852958, 0.00852958,\n", " 0.00852958, 0.01705916, 0.01705916, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.06823664, 0.00852958,\n", " 0.00852958, 0.00852958, 0.14500285, 0.01705916, 0.00852958,\n", " 0.03411832, 0.01705916, 0.00852958, 0.02558874, 0.0852958 ,\n", " 0.05117748, 0.02558874, 0.22176907, 0.06823664, 0.05117748,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n", " 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.01705916,\n", " 0.07676622, 0.00852958, 0.00852958, 0.19618033, 0.00852958,\n", " 0.00852958, 0.02558874, 0.01705916, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.03411832,\n", " 0.01705916, 0.02558874, 0.02558874, 0.01705916, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.07676622, 0.02558874,\n", " 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.17912117, 0.24735781, 0.02558874, 0.00852958,\n", " 0.01705916, 0.01705916, 0.02558874, 0.00852958, 0.13647327,\n", " 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.03411832, 0.00852958, 0.03411832,\n", " 0.00852958, 0.0426479 , 0.05117748, 0.02558874, 0.01705916,\n", " 0.01705916, 
0.05970706, 0.00852958, 0.02558874, 0.01705916,\n", " 0.00852958, 0.01705916, 0.00852958, 0.06823664, 0.09382538,\n", " 0.00852958, 0.00852958, 0.05117748, 0.00852958, 0.03411832,\n", " 0.00852958, 0.00852958, 0.02558874, 0.02558874, 0.02558874,\n", " 0.01705916, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.01705916, 0.00852958, 0.0426479 , 0.00852958,\n", " 0.05970706, 0.00852958, 0.12794369, 0.00852958, 0.03411832,\n", " 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.05970706,\n", " 0.00852958, 0.05970706, 0.05117748, 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.12794369, 0.00852958, 0.05970706,\n", " 0.05970706, 0.00852958, 0.0426479 , 0.00852958, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n", " 0.00852958, 0.03411832, 0.00852958, 0.00852958, 0.01705916,\n", " 0.00852958, 0.00852958, 0.02558874, 0.12794369, 0.00852958,\n", " 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.06823664,\n", " 0.00852958, 0.06823664]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tfidf_vec = TfidfVectorizer(stop_words='english')\n", "transformed = tfidf_vec.fit_transform(raw_documents = [document])\n", "transformed = np.array(transformed.todense())\n", "transformed" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "demand 0.281476\n", "business 0.272947\n", "samsung 0.247358\n", "products 0.221769\n", "quarter 0.196180\n", "earnings 0.187651\n", "sales 0.179121\n", "mobile 0.162062\n", "new 0.153532\n", "company 0.153532\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index_value = {i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}\n", "fully_indexed = {index_value[column]:value for row in transformed \n", " for (column,value) in enumerate(row)}\n", "\n", 
"token_tfidf = pd.Series(fully_indexed).sort_values(ascending=False)\n", "token_tfidf[:10]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }