{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "# **tf-idf**\n", "\n", "## **1 tf - idf 를 직접 구현하기**\n", "[**지속성장 경영 보고서**](https://images.samsung.com/is/content/samsung/p5/sec/aboutsamsung/2018/pdf/SustainabilityReport_2018_kr.pdf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The first job is to build the list of documents (the corpus); this cell only\n", "# smoke-tests tf_idf on a tiny hand-made example.\n", "# NOTE(review): the second argument here is a raw string, whereas the later call\n", "# passes a token list -- confirm which form txtutil.tf_idf expects.\n", "from txtutil import tf_idf\n", "tf_idf('갤럭시', '갤럭시 노트 신제품 출시', ['갤럭시','갤럭시','노트','신제품','출시','출시'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from glob import glob\n", "# glob() returns matches in arbitrary order; sort so the corpus is built the\n", "# same way on every run and on every machine.\n", "filelist = sorted(glob('./data/kr-Report_201?.txt'))\n", "# Fail fast: an empty match would silently leave the corpus empty for all later cells.\n", "assert filelist, 'no report files matched ./data/kr-Report_201?.txt'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Filter tokens using a stop-word list (this whole cell is currently disabled)\n", "# # stopwords.txt : words that appear in all of the 2015, 2016, 2017 and 2018 reports\n", "# f = open('./data/stopwords.txt', 'r', encoding='utf-8')\n", "# stopwords = f.read(); f.close()\n", "# stopwords = stopwords.split(' ')\n", "# stopwords[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "from txtutil import txtnoun\n", "from nltk.tokenize import word_tokenize\n", "\n", "# Mapping passed to txtnoun as `skip` (presumably misspelling -> corrected form;\n", "# confirm in txtutil).\n", "# NOTE(review): '가치창출' maps to itself -- confirm the intended key.\n", "skips = {'갤러시':'갤럭시', '가치창출':'가치창출'}\n", "\n", "# Corpus: the tokens of every matched report, concatenated into one flat list.\n", "docs_tokens = []\n", "for path in filelist:\n", "    report_tokens = word_tokenize(txtnoun(path, skip=skips))\n", "    # Keep only tokens longer than two characters.\n", "    # Disabled stop-word variant: (len(token) > 2) and (token not in stopwords)\n", "    docs_tokens.extend(token for token in report_tokens if len(token) > 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk import FreqDist\n", "import pandas as pd\n", "# Five most frequent tokens across the whole corpus.\n", "pd.Series(FreqDist(docs_tokens)).sort_values(ascending=False).head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "## **2 tf-idf 를 분석할 대상문서 데이터 불러오기**\n", "tf-idf 분석할 대상문서 수집" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Document under analysis: the 2018 report.\n", "target_text = txtnoun('./data/kr-Report_2018.txt', skip=skips)\n", "# Keep only tokens longer than two characters.\n", "# Disabled stop-word variant: (len(token) > 2) and (token not in stopwords)\n", "target_tokens = [token for token in word_tokenize(target_text) if len(token) > 2]\n", "target_tokens[:7]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "from txtutil import tf_idf\n", "\n", "# Score every distinct token of the target document against the corpus tokens.\n", "tfidf_by_token = {\n", "    term: tf_idf(term, target_tokens, docs_tokens)\n", "    for term in set(target_tokens)\n", "}\n", "print('Calculating is Done.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# tf-idf for the 2018 report\n", "# Show the 20 highest-scoring tokens as a pandas Series\n", "import pandas as pd\n", "tfidf = pd.Series(tfidf_by_token)\n", "tfidf.sort_values(ascending=False).head(20)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }