{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"# **tf-idf**\n",
"\n",
"## **1 Document 자료를 불러오기**\n",
"sklearn을 활용한 tf-idf 계산\n",
"[**연간 기업결과 리포트**](https://news.samsung.com/global/samsung-electronics-announces-fourth-quarter-and-fy-2017-results)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'samsung electronics posted krw 65.98 trillion in consolidated revenue and krw 15.15 trillion in operating profit for the fourth quarter of 2017.\\n \\noverall, the company reported full-year revenue of krw 239.58 trillion and full-year operating profit of krw 53.65 trillion.\\n \\nfourth quarter earnings we'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Document 자료를 불러온다 : 2017년 연간결산 리포트\n",
"with open('../data/News2017.txt', 'r', encoding='utf-8') as f:\n",
" texts = f.read()\n",
"\n",
"texts = texts.lower()\n",
"texts[:300]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'samsung electronics posted krw trillion in consolidated revenue and krw trillion in operating profit for the fourth quarter of overall the company reported full year revenue of krw trillion and full year operating profit of krw trillion fourth quarter earnings were driven by the components business '"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 영문 Token만 추출한다\n",
"from nltk.tokenize import RegexpTokenizer\n",
"re_capt = RegexpTokenizer(r'[a-z]\\w+')\n",
"tokens = re_capt.tokenize(texts)\n",
"document = \" \".join(tokens)\n",
"document[:300]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"the 153\n",
"and 100\n",
"to 95\n",
"in 77\n",
"for 74\n",
"of 53\n",
"will 37\n",
"as 35\n",
"demand 33\n",
"by 32\n",
"dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 추출한 Token의 빈도를 계산한다\n",
"from nltk import FreqDist\n",
"import pandas as pd\n",
"token_freq = FreqDist(tokens)\n",
"token_freq = pd.Series(token_freq).sort_values(ascending=False)\n",
"token_freq[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **2 sklean 을 활용한 tf idf 계산**\n",
"sklearn의 기본 데이터를 활용하여 tf-idf 결과값 출력"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# ! pip3 install sklearn"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# ! pip3 install scipy"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.01705916, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.01705916, 0.00852958, 0.03411832,\n",
" 0.02558874, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.03411832, 0.02558874, 0.03411832, 0.01705916, 0.00852958,\n",
" 0.01705916, 0.0426479 , 0.03411832, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.05970706, 0.00852958, 0.0426479 , 0.01705916,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.03411832, 0.02558874, 0.00852958, 0.00852958,\n",
" 0.03411832, 0.01705916, 0.00852958, 0.03411832, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.27294655,\n",
" 0.0426479 , 0.01705916, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.00852958,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.01705916, 0.02558874, 0.00852958, 0.00852958,\n",
" 0.01705916, 0.15353243, 0.00852958, 0.00852958, 0.03411832,\n",
" 0.05970706, 0.00852958, 0.01705916, 0.00852958, 0.00852958,\n",
" 0.03411832, 0.00852958, 0.00852958, 0.0426479 , 0.02558874,\n",
" 0.01705916, 0.03411832, 0.01705916, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.0426479 , 0.01705916, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.0426479 , 0.01705916,\n",
" 0.00852958, 0.02558874, 0.01705916, 0.02558874, 0.02558874,\n",
" 0.00852958, 0.01705916, 0.05970706, 0.28147613, 0.00852958,\n",
" 0.06823664, 0.03411832, 0.00852958, 0.01705916, 0.01705916,\n",
" 0.01705916, 0.00852958, 0.00852958, 0.03411832, 0.00852958,\n",
" 0.06823664, 0.01705916, 0.00852958, 0.05970706, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.05117748, 0.00852958,\n",
" 0.01705916, 0.02558874, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.18765075, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.02558874, 0.05117748, 0.00852958, 0.10235496,\n",
" 0.00852958, 0.01705916, 0.01705916, 0.00852958, 0.01705916,\n",
" 0.02558874, 0.00852958, 0.0852958 , 0.07676622, 0.03411832,\n",
" 0.00852958, 0.14500285, 0.05117748, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.0852958 , 0.00852958,\n",
" 0.03411832, 0.00852958, 0.00852958, 0.09382538, 0.02558874,\n",
" 0.00852958, 0.02558874, 0.01705916, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.03411832, 0.05970706, 0.01705916, 0.01705916,\n",
" 0.0426479 , 0.00852958, 0.00852958, 0.00852958, 0.02558874,\n",
" 0.02558874, 0.02558874, 0.0426479 , 0.01705916, 0.10235496,\n",
" 0.0426479 , 0.00852958, 0.13647327, 0.01705916, 0.00852958,\n",
" 0.00852958, 0.02558874, 0.00852958, 0.00852958, 0.03411832,\n",
" 0.00852958, 0.00852958, 0.05117748, 0.01705916, 0.00852958,\n",
" 0.00852958, 0.07676622, 0.06823664, 0.07676622, 0.05970706,\n",
" 0.00852958, 0.01705916, 0.02558874, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.03411832, 0.02558874, 0.00852958, 0.01705916,\n",
" 0.01705916, 0.00852958, 0.13647327, 0.05970706, 0.00852958,\n",
" 0.01705916, 0.01705916, 0.00852958, 0.03411832, 0.10235496,\n",
" 0.00852958, 0.0426479 , 0.03411832, 0.05117748, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.0426479 , 0.00852958, 0.00852958,\n",
" 0.05117748, 0.05970706, 0.02558874, 0.00852958, 0.02558874,\n",
" 0.02558874, 0.02558874, 0.00852958, 0.01705916, 0.0426479 ,\n",
" 0.01705916, 0.00852958, 0.00852958, 0.06823664, 0.00852958,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.14500285, 0.0426479 ,\n",
" 0.02558874, 0.02558874, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.01705916, 0.05970706, 0.00852958, 0.03411832, 0.00852958,\n",
" 0.02558874, 0.00852958, 0.01705916, 0.16206201, 0.00852958,\n",
" 0.0426479 , 0.00852958, 0.00852958, 0.00852958, 0.05970706,\n",
" 0.01705916, 0.00852958, 0.01705916, 0.02558874, 0.01705916,\n",
" 0.15353243, 0.01705916, 0.0426479 , 0.0426479 , 0.01705916,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.09382538,\n",
" 0.00852958, 0.00852958, 0.01705916, 0.06823664, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.01705916, 0.01705916, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.02558874, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.07676622, 0.06823664, 0.00852958, 0.00852958,\n",
" 0.01705916, 0.01705916, 0.01705916, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.01705916, 0.01705916, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.06823664, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.14500285, 0.01705916, 0.00852958,\n",
" 0.03411832, 0.01705916, 0.00852958, 0.02558874, 0.0852958 ,\n",
" 0.05117748, 0.02558874, 0.22176907, 0.06823664, 0.05117748,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.01705916,\n",
" 0.07676622, 0.00852958, 0.00852958, 0.19618033, 0.00852958,\n",
" 0.00852958, 0.02558874, 0.01705916, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.03411832,\n",
" 0.01705916, 0.02558874, 0.02558874, 0.01705916, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.07676622, 0.02558874,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.17912117, 0.24735781, 0.02558874, 0.00852958,\n",
" 0.01705916, 0.01705916, 0.02558874, 0.00852958, 0.13647327,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.03411832, 0.00852958, 0.03411832,\n",
" 0.00852958, 0.0426479 , 0.05117748, 0.02558874, 0.01705916,\n",
" 0.01705916, 0.05970706, 0.00852958, 0.02558874, 0.01705916,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.06823664, 0.09382538,\n",
" 0.00852958, 0.00852958, 0.05117748, 0.00852958, 0.03411832,\n",
" 0.00852958, 0.00852958, 0.02558874, 0.02558874, 0.02558874,\n",
" 0.01705916, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.01705916, 0.00852958, 0.0426479 , 0.00852958,\n",
" 0.05970706, 0.00852958, 0.12794369, 0.00852958, 0.03411832,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.01705916, 0.05970706,\n",
" 0.00852958, 0.05970706, 0.05117748, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.12794369, 0.00852958, 0.05970706,\n",
" 0.05970706, 0.00852958, 0.0426479 , 0.00852958, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.00852958,\n",
" 0.00852958, 0.03411832, 0.00852958, 0.00852958, 0.01705916,\n",
" 0.00852958, 0.00852958, 0.02558874, 0.12794369, 0.00852958,\n",
" 0.00852958, 0.00852958, 0.00852958, 0.00852958, 0.06823664,\n",
" 0.00852958, 0.06823664]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"tfidf_vec = TfidfVectorizer(stop_words='english')\n",
"transformed = tfidf_vec.fit_transform(raw_documents = [document])\n",
"transformed = np.array(transformed.todense())\n",
"transformed"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"demand 0.281476\n",
"business 0.272947\n",
"samsung 0.247358\n",
"products 0.221769\n",
"quarter 0.196180\n",
"earnings 0.187651\n",
"sales 0.179121\n",
"mobile 0.162062\n",
"new 0.153532\n",
"company 0.153532\n",
"dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index_value = {i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}\n",
"fully_indexed = {index_value[column]:value for row in transformed \n",
" for (column,value) in enumerate(row)}\n",
"\n",
"token_tfidf = pd.Series(fully_indexed).sort_values(ascending=False)\n",
"token_tfidf[:10]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}