{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"# **gensim | word2vec**\n",
"
\n",
"## **1 데이터 전처리**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['../data/kr-Report_2016.txt', '../data/kr-Report_2017.txt', '../data/kr-Report_2015.txt', '../data/kr-Report_2018.txt']\n"
]
},
{
"data": {
"text/plain": [
"'삼성전자\\n 지속가능경영보고서\\n 삼성전자 지속가능경영보고서\\n 삼성전자 지속가능경영 사람과 사회 환경 조화롭\\n 공존 발전\\n 가치 컬러 그래픽 조화롭 구성 심플 톤앤매너\\n 표지 전달 또한 인재 기술 바탕 최고 제품\\n 서비스 창출하 인류사회 공헌 경영이념 전개 사람 사회\\n 환경 포괄 영역 삼성전자 활동 세분 라인 조합\\n 완성 형태 움직임 통해 표현\\n 보고서 작성 개요\\n 삼성전자 경제 가치 환경 보호 사회 발전 지속가능경영 통해 세상 긍정\\n 가치 제공 지속가능경영 활동 성과 공개 바탕\\n 이해관계자 소통 위해 아홉 지속가능경영보고서 발간\\n 보'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import glob\n",
"from txtutil import txtnoun\n",
"\n",
"# Collect the 2015-2018 sustainability report text files.\n",
"# glob.glob() returns paths in arbitrary order, so sort them to keep the\n",
"# concatenated corpus identical from run to run and machine to machine.\n",
"filelist = sorted(glob.glob('../data/kr-Report_201?.txt'))\n",
"print(filelist)\n",
"\n",
"# Extract only the noun tokens from each loaded document.\n",
"# NOTE(review): the '가치창출' entry maps a word to itself -- confirm it is intentional.\n",
"skiplist = {'갤러시':'갤럭시', '가치창출':'가치창출'}\n",
"# Join the per-file strings directly so `texts` is only ever a single str.\n",
"texts = \" \".join(txtnoun(file, skip=skiplist) for file in filelist)\n",
"texts[:300]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Persist the noun-token corpus to ssResport.txt\n",
"texts_file = '../data/ssResport.txt'\n",
"with open(texts_file, mode='w', encoding='utf-8') as corpus_out:\n",
"    corpus_out.write(texts)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# ! head -n 10 ../data/ssResport.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **2 Word2Vec 객체 만들기**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# ! pip3 install 'gensim<4'  # the cells below use the gensim 3.x API (size=, iter=, wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model saved.\n",
"CPU times: user 1min 19s, sys: 749 ms, total: 1min 19s\n",
"Wall time: 32.3 s\n"
]
}
],
"source": [
"%%time\n",
"# Train a skip-gram Word2Vec model on the token corpus and store it on disk.\n",
"from gensim.models import word2vec\n",
"\n",
"texts_file = '../data/ssResport.txt'\n",
"sentences = word2vec.LineSentence(texts_file)  # one sentence per line of the file\n",
"model = word2vec.Word2Vec(\n",
"    sentences,\n",
"    size=200,      # dimensionality of the word vectors\n",
"    window=2,      # maximum distance between current and predicted word\n",
"    min_count=20,  # drop words seen fewer than 20 times\n",
"    hs=1,          # hierarchical softmax\n",
"    workers=4,     # training threads\n",
"    iter=100,      # passes over the corpus\n",
"    sg=1,          # skip-gram (instead of CBOW)\n",
")\n",
"model.save(\"../data/ssReport.model\")\n",
"print(\"model saved.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **3 저장된 객체 활용**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n"
]
}
],
"source": [
"# -f skips the interactive y/n confirmation, so Restart & Run All does not stall here\n",
"%reset -f"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Interactive namespace is empty.\n"
]
}
],
"source": [
"%who"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"927"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reload the trained model from disk and report how many words it holds\n",
"from gensim.models import word2vec\n",
"\n",
"model = word2vec.Word2Vec.load('../data/ssReport.model')\n",
"len(model.wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['삼성전자', '관리', '제품', '협력사', '임직원', '사업', '위해', '통해', '글로벌', '교육']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First ten entries of the model's index-to-word list\n",
"model.wv.index2word[:10]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('생각', 0.2546592652797699),\n",
" ('제조', 0.22521910071372986),\n",
" ('인사말', 0.2224113941192627),\n",
" ('지구', 0.22195805609226227),\n",
" ('전담조직', 0.211161807179451),\n",
" ('가장', 0.2056487500667572),\n",
" ('성장', 0.2023772895336151),\n",
" ('스스로', 0.19887249171733856),\n",
" ('스타트업', 0.19723141193389893),\n",
" ('노력', 0.19586186110973358)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten vocabulary words most similar to '삼성전자'\n",
"model.wv.most_similar(positive=['삼성전자'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('각주', 0.10347022861242294),\n",
" ('주기', 0.09186889231204987),\n",
" ('완료', 0.08225712180137634),\n",
" ('예정', 0.08068017661571503),\n",
" ('기타', 0.057173024863004684),\n",
" ('케냐', 0.04238442704081535),\n",
" ('에어컨', 0.040793925523757935),\n",
" ('적극', 0.03802839294075966),\n",
" ('변화', 0.03759170323610306),\n",
" ('훈련', 0.03625089302659035)]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten top-ranked words when '삼성전자' is given as a negative (subtracted) term\n",
"model.wv.most_similar(negative=['삼성전자'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('책임경영', 0.2668074667453766),\n",
" ('기업시민', 0.2409428507089615),\n",
" ('생산거점', 0.2276557981967926),\n",
" ('전문조직', 0.22614848613739014),\n",
" ('인재경영', 0.22412163019180298),\n",
" ('정도경영', 0.22231942415237427),\n",
" ('법인', 0.22023336589336395),\n",
" ('매출액', 0.2185959368944168),\n",
" ('해외', 0.2185892015695572),\n",
" ('국가별', 0.2148313820362091)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten vocabulary words most similar to '글로벌'\n",
"model.wv.most_similar(positive=['글로벌'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('각주', 0.08886698633432388),\n",
" ('선택', 0.08736976981163025),\n",
" ('설비', 0.07159534096717834),\n",
" ('준수', 0.06772229075431824),\n",
" ('배터리', 0.06750676780939102),\n",
" ('전달', 0.06239762529730797),\n",
" ('위반', 0.060864850878715515),\n",
" ('검사', 0.056238118559122086),\n",
" ('개사', 0.05387230962514877),\n",
" ('가족', 0.052691079676151276)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten top-ranked words when '글로벌' is given as a negative (subtracted) term\n",
"model.wv.most_similar(negative=['글로벌'], topn=10)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('가치창출', 0.2803115248680115),\n",
" ('기업', 0.26225581765174866),\n",
" ('사업활동', 0.261832058429718),\n",
" ('인식', 0.24657797813415527),\n",
" ('생각', 0.23894557356834412),\n",
" ('기업시민', 0.23124957084655762),\n",
" ('협의체', 0.22006073594093323),\n",
" ('측면', 0.2161359190940857),\n",
" ('정량성과표', 0.21145324409008026),\n",
" ('폐기', 0.20632822811603546)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combined query: '삼성전자' and '경영활동' as positive terms, '근무환경' as the negative term\n",
"# (the original note lists '담당자' and '직원', presumably as other words to try)\n",
"model.wv.most_similar(\n",
"    positive=['삼성전자', '경영활동'],\n",
"    negative=['근무환경'],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **4 Visualization**\n",
"gensim"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['삼성전자', '지속가능경영보고서', '지속가능경영', '사회', '환경', '발전', '가치', '구성', '전달', '또한']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First ten words of the vocabulary mapping, in its iteration order\n",
"list(model.wv.vocab)[:10]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/plain": [
"(927, 200)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# model.wv.vocab is keyed by word; list() collects the words in its iteration order\n",
"vocab = list(model.wv.vocab)\n",
"# Index through model.wv: Word2Vec.__getitem__ is deprecated and is removed in gensim 4.0\n",
"X = model.wv[vocab]\n",
"# One row per vocabulary word, one column per embedding dimension\n",
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 17.4 s, sys: 1.34 s, total: 18.7 s\n",
"Wall time: 18.5 s\n"
]
}
],
"source": [
"%%time\n",
"from sklearn.manifold import TSNE\n",
"\n",
"# t-SNE is stochastic: fix random_state so the 2-D layout is reproducible across runs\n",
"tsne = TSNE(n_components=2, random_state=0)\n",
"X_tsne = tsne.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n", " | x | \n", "y | \n", "
---|---|---|
삼성전자 | \n", "2.595189 | \n", "2.475562 | \n", "
지속가능경영보고서 | \n", "0.375126 | \n", "67.879570 | \n", "
지속가능경영 | \n", "22.494762 | \n", "39.360992 | \n", "
사회 | \n", "0.464887 | \n", "6.196083 | \n", "
환경 | \n", "-5.406484 | \n", "0.462887 | \n", "