{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"# **gensim | doc2vec**\n",
"
\n",
"## **1 네이버 리뷰 문장의 활용**\n",
"네이버 영화리뷰 단어모델 만들기"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from konlpy.tag import Twitter\n",
"twitter = Twitter()\n",
"\n",
"# 네이버 영화리뷰 데이터 불러오기 (1/10만 추출한다)\n",
"def read_data(filename):\n",
" with open(filename, 'r', encoding='utf-8') as f:\n",
" data = [line.split('\\t') for line in f.read().splitlines()]\n",
" \n",
" from random import randint\n",
" random_data = [data[randint(1, len(data))] for no in range(int(len(data)/10)) ]\n",
" return random_data\n",
"\n",
"# 한글 Token에 품사정보를 덧붙이기\n",
"def tokenize(doc):\n",
" return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 24.6 s, sys: 335 ms, total: 24.9 s\n",
"Wall time: 17.5 s\n"
]
}
],
"source": [
"%%time\n",
"from collections import namedtuple\n",
"train_data = read_data('../data/ratings_train.txt')\n",
"train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]\n",
"TaggedDocument = namedtuple('TaggedDocument', 'words tags')\n",
"tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TaggedDocument(words=['제이슨/Noun', '스타덤/Noun', '의/Josa', '그/Noun', '흔하다/Adjective', '액션/Noun', '하나/Noun', '화끈/Noun', '하다/Verb', '보이다/Verb', '못/VerbPrefix', '하다/Verb', '.../Punctuation', '조연/Noun', '들/Suffix', '스토리/Noun', '도/Josa', '엉/Exclamation', '성하/Noun', '게/Josa', '마무리/Noun', '되다/Verb', ',/Punctuation', '실망/Noun', '가득하다/Adjective', '영화/Noun'], tags=['0'])\n"
]
}
],
"source": [
"from pprint import pprint\n",
"pprint(tagged_train_docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **2 doc2vec 파라미터 설정 및 학습**\n",
"**[세부적인 파라미터 설정](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"doc2vec Model Saved\n",
"CPU times: user 55.9 s, sys: 3.72 s, total: 59.6 s\n",
"Wall time: 30.2 s\n"
]
}
],
"source": [
"%%time\n",
"from gensim.models import doc2vec\n",
"# doc_vectorizer = doc2vec.Doc2Vec(\n",
"# dm = 0, # PV-DBOW / default 1\n",
"# dbow_words = 1, # w2v simultaneous with DBOW d2v / default 0\n",
"# window = 8, # distance between the predicted word and context words\n",
"# vector_size = 300, # vector size\n",
"# alpha = 0.025, # learning-rate\n",
"# seed = 1234,\n",
"# min_count = 20, # ignore with freq lower\n",
"# min_alpha = 0.025, # min learning-rate\n",
"# workers = 4, # multi cpu\n",
"# hs = 1, # hierarchical softmax / default 0\n",
"# negative = 10, # negative sampling / default 5\n",
"# )\n",
"\n",
"# Doc2Vec 모델을 정의\n",
"doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n",
"\n",
"# 정의된 모델에 Token 자료를 연결\n",
"doc_vectorizer.build_vocab(tagged_train_docs)\n",
"\n",
"for epoch in range(10):\n",
" doc_vectorizer.train(tagged_train_docs, \n",
" total_examples = doc_vectorizer.corpus_count, \n",
" epochs = doc_vectorizer.epochs)\n",
" doc_vectorizer.alpha -= 0.002\n",
" doc_vectorizer.min_alpha = doc_vectorizer.alpha \n",
"\n",
"# 학습이 완료된 모델의 데이터를 저장한다\n",
"doc_vectorizer.save('../data/doc2vec.model')\n",
"print(\"doc2vec Model Saved\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **3 doc2Vec 모델활용**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n"
]
}
],
"source": [
"%reset"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Interactive namespace is empty.\n"
]
}
],
"source": [
"%who"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import doc2vec\n",
"from pprint import pprint\n",
"doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('코미디/Noun', 0.4376811385154724),\n",
" ('스릴러/Noun', 0.43532365560531616),\n",
" ('스럽지도/Josa', 0.39870721101760864),\n",
" ('박진/Noun', 0.39721059799194336),\n",
" ('장르/Noun', 0.389506995677948),\n",
" ('액션영화/Noun', 0.3805934488773346),\n",
" ('블랙/Noun', 0.3729945719242096),\n",
" ('종교/Noun', 0.3727499842643738),\n",
" ('롭고/Josa', 0.3712288737297058),\n",
" ('풍자/Noun', 0.3630199432373047)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
}
],
"source": [
"pprint(doc_vectorizer.wv.most_similar('공포/Noun'))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"-0.012412064"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('스럽지도/Josa', 0.384132444858551),\n",
" ('장르/Noun', 0.3558204174041748),\n",
" ('코믹/Noun', 0.3339052200317383),\n",
" ('스릴러/Noun', 0.3245369791984558),\n",
" ('고어/Noun', 0.3172786235809326),\n",
" ('명분/Noun', 0.3150975704193115),\n",
" ('히어로/Noun', 0.31317320466041565),\n",
" ('로맨스/Noun', 0.3098544478416443),\n",
" ('복선/Noun', 0.29699862003326416),\n",
" ('신파/Noun', 0.2888486981391907)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
" if np.issubdtype(vec.dtype, np.int):\n"
]
}
],
"source": [
"pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], \n",
" negative=['남자/Noun']))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1.99763617e-03, 1.71703286e-03, -2.20827432e-03, 3.30096926e-03,\n",
" 3.01561877e-03, -1.26834167e-03, 1.21280085e-02, -1.89038850e-02,\n",
" 3.29405302e-05, 2.55695544e-03], dtype=float32)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"-0.15552977"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.03374508, -0.00827203, 0.0110028 , -0.00999906, -0.01591366,\n",
" 0.00750665, -0.00079125, 0.00492844, -0.00993689, -0.01647354],\n",
" dtype=float32)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.22114044"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}