{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# **gensim | word2vec**\n",
"\n",
"## **1 데이터 전처리**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['../data/kr-Report_2016.txt', '../data/kr-Report_2017.txt', '../data/kr-Report_2015.txt', '../data/kr-Report_2018.txt']\n"
]
},
{
"data": {
"text/plain": [
"'삼성전자\\n 지속가능경영보고서\\n 삼성전자 지속가능경영보고서\\n 삼성전자 지속가능경영 사람과 사회 환경 조화롭\\n 공존 발전\\n 가치 컬러 그래픽 조화롭 구성 심플 톤앤매너\\n 표지 전달 또한 인재 기술 바탕 최고 제품\\n 서비스 창출하 인류사회 공헌 경영이념 전개 사람 사회\\n 환경 포괄 영역 삼성전자 활동 세분 라인 조합\\n 완성 형태 움직임 통해 표현\\n 보고서 작성 개요\\n 삼성전자 경제 가치 환경 보호 사회 발전 지속가능경영 통해 세상 긍정\\n 가치 제공 지속가능경영 활동 성과 공개 바탕\\n 이해관계자 소통 위해 아홉 지속가능경영보고서 발간\\n 보'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import glob\n",
"from txtutil import txtnoun\n",
"\n",
"# Collect the token files of the 2015-2018 sustainability reports.\n",
"# glob.glob() returns paths in arbitrary, filesystem-dependent order, so sort them\n",
"# to keep the concatenated corpus (and anything trained on it) in a stable order.\n",
"filelist = sorted(glob.glob('../data/kr-Report_201?.txt'))\n",
"print(filelist)\n",
"\n",
"# Extract only the noun tokens from each loaded document (project-local txtnoun helper).\n",
"# NOTE(review): `skip` presumably maps a raw token to its replacement - the first entry\n",
"# looks like a spelling correction, the second maps a token to itself; confirm in txtutil.\n",
"skiplist = {'갤러시':'갤럭시', '가치창출':'가치창출'}\n",
"\n",
"# Join the per-report strings into one corpus string and preview its first 300 characters.\n",
"texts = \" \".join(txtnoun(path, skip=skiplist) for path in filelist)\n",
"texts[:300]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Persist the noun-token corpus as ssResport.txt so it can be read back from disk later.\n",
"texts_file = '../data/ssResport.txt'\n",
"with open(texts_file, mode='w', encoding='utf-8') as corpus_out:\n",
"    corpus_out.write(texts)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# ! cat ./data/ssResport.txt | head -n 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## **2 Word2Vec 객체 만들기**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# ! pip3 install gensim"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model saved.\n",
"CPU times: user 1min 17s, sys: 426 ms, total: 1min 18s\n",
"Wall time: 31.1 s\n"
]
}
],
"source": [
"%%time\n",
"from gensim.models import word2vec\n",
"\n",
"# Train a skip-gram Word2Vec model on the saved corpus, then store the model on disk.\n",
"# `size` and `iter` are the gensim 3.x keyword names.\n",
"texts_file = '../data/ssResport.txt'\n",
"sentences = word2vec.LineSentence(texts_file)\n",
"model = word2vec.Word2Vec(\n",
"    sentences,\n",
"    size=200,      # dimensionality of the word vectors\n",
"    window=2,      # maximum distance between the current and the predicted word\n",
"    min_count=20,  # ignore words with a total frequency below this\n",
"    hs=1,          # 1 = use hierarchical softmax\n",
"    workers=4,     # number of training threads\n",
"    iter=100,      # number of passes over the corpus\n",
"    sg=1,          # 1 = skip-gram\n",
")\n",
"model.save(\"../data/ssReport.model\")\n",
"print(\"model saved.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## **3 저장된 객체 활용**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n"
]
}
],
"source": [
"# Clear the interactive namespace so the following cells only rely on the saved model.\n",
"# -f skips the y/n confirmation prompt, which would otherwise block an unattended Run All.\n",
"%reset -f"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Interactive namespace is empty.\n"
]
}
],
"source": [
"%who"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"927"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from gensim.models import word2vec\n",
"\n",
"# Reload the trained model from disk and report how many words its vocabulary holds.\n",
"model = word2vec.Word2Vec.load('../data/ssReport.model')\n",
"len(model.wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['삼성전자', '관리', '제품', '협력사', '임직원', '사업', '위해', '통해', '글로벌', '교육']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First ten entries of the index-to-word list (gensim sorts the vocabulary by\n",
"# descending frequency by default, so these are the most frequent words).\n",
"model.wv.index2word[:10]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('생각', 0.2371642142534256),\n",
" ('인사말', 0.22413358092308044),\n",
" ('이니셔티브', 0.2232244610786438),\n",
" ('지구', 0.22301766276359558),\n",
" ('존중', 0.2171960026025772),\n",
" ('공헌', 0.21072670817375183),\n",
" ('가장', 0.21015311777591705),\n",
" ('행동', 0.2073962390422821),\n",
" ('제작', 0.20662453770637512),\n",
" ('제조', 0.20571179687976837)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten nearest neighbours of '삼성전자' by cosine similarity (topn=10 is the library default).\n",
"model.wv.most_similar(positive=['삼성전자'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('각주', 0.09539655596017838),\n",
" ('주기', 0.090581014752388),\n",
" ('인원', 0.06392117589712143),\n",
" ('훈련', 0.0467417910695076),\n",
" ('모바일', 0.04639029502868652),\n",
" ('에어컨', 0.046074528247117996),\n",
" ('처리', 0.0430106446146965),\n",
" ('변화', 0.0425986647605896),\n",
" ('기여', 0.04008210450410843),\n",
" ('소비자', 0.03669456019997597)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Passing the word only as a negative example ranks the vocabulary against its\n",
"# negated vector (topn=10 is the library default).\n",
"model.wv.most_similar(negative=['삼성전자'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('기업시민', 0.27174001932144165),\n",
" ('책임경영', 0.24501878023147583),\n",
" ('매출액', 0.23786215484142303),\n",
" ('의식', 0.23527146875858307),\n",
" ('디스플레이', 0.2316412329673767),\n",
" ('세계', 0.2269509881734848),\n",
" ('판매거점', 0.2236602008342743),\n",
" ('인재양성', 0.21667678654193878),\n",
" ('전세계', 0.21417926251888275),\n",
" ('법인', 0.21227560937404633)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten nearest neighbours of '글로벌' by cosine similarity (topn=10 is the library default).\n",
"model.wv.most_similar(positive=['글로벌'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('각주', 0.14380551874637604),\n",
" ('선택', 0.10329953581094742),\n",
" ('가족', 0.08594916760921478),\n",
" ('최종', 0.0808052197098732),\n",
" ('검사', 0.08069653809070587),\n",
" ('판단', 0.07758765667676926),\n",
" ('설비', 0.07216480374336243),\n",
" ('혜택', 0.06981527805328369),\n",
" ('결정', 0.06863678246736526),\n",
" ('개사', 0.05936558544635773)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Passing the word only as a negative example ranks the vocabulary against its\n",
"# negated vector (topn=10 is the library default).\n",
"model.wv.most_similar(negative=['글로벌'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('가치창출', 0.275915265083313),\n",
" ('기업', 0.2585913836956024),\n",
" ('측면', 0.24071839451789856),\n",
" ('전력', 0.23596522212028503),\n",
" ('사업활동', 0.2355266958475113),\n",
" ('생각', 0.23054496943950653),\n",
" ('책임', 0.22358964383602142),\n",
" ('천톤', 0.2161034494638443),\n",
" ('행동', 0.21353504061698914),\n",
" ('인식', 0.2054463028907776)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Analogy-style query: '삼성전자' + '경영활동' - '근무환경'.\n",
"# NOTE(review): the original carried the trailing note '담당자, 직원'\n",
"# (person in charge, employee) - presumably other words to try; confirm with the author.\n",
"model.wv.most_similar(\n",
"    positive=['삼성전자', '경영활동'],\n",
"    negative=['근무환경'],\n",
"    topn=10,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## **4 Visualization**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['삼성전자', '지속가능경영보고서', '지속가능경영', '사회', '환경', '발전', '가치', '구성', '전달', '또한']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at ten vocabulary keys; this follows the mapping's own iteration order,\n",
"# which differs from the frequency-ranked index2word list shown earlier.\n",
"list(model.wv.vocab)[:10]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/plain": [
"(927, 200)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# model.wv.vocab is keyed by word; collect every vocabulary word and stack the\n",
"# corresponding vectors into one matrix with a row per word.\n",
"vocab = list(model.wv.vocab)\n",
"# Index through model.wv: model[...] raises a DeprecationWarning and is slated\n",
"# for removal in gensim 4.0.0.\n",
"X = model.wv[vocab]\n",
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16.4 s, sys: 1.49 s, total: 17.9 s\n",
"Wall time: 17.9 s\n"
]
}
],
"source": [
"%%time\n",
"from sklearn.manifold import TSNE\n",
"\n",
"# Project the word vectors down to two dimensions for plotting.\n",
"# t-SNE is stochastic: fix random_state so the layout is reproducible across runs.\n",
"tsne = TSNE(n_components=2, random_state=0)\n",
"X_tsne = tsne.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | x | \n", "y | \n", "
---|---|---|
삼성전자 | \n", "-0.172935 | \n", "-0.924926 | \n", "
지속가능경영보고서 | \n", "-2.630885 | \n", "-0.719760 | \n", "
지속가능경영 | \n", "-2.043247 | \n", "-0.660033 | \n", "
사회 | \n", "-1.052006 | \n", "-1.501886 | \n", "
환경 | \n", "-0.137400 | \n", "-0.759023 | \n", "