{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br></br>\n",
    "# **gensim | doc2vec**\n",
    "<br></br>\n",
    "## **1 네이버 리뷰 문장의 활용**\n",
    "네이버 영화리뷰 단어모델 만들기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from konlpy.tag import Twitter\n",
    "twitter = Twitter()\n",
    "\n",
    "# 네이버 영화리뷰 데이터 불러오기 (1/10만 추출한다)\n",
    "def read_data(filename):\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        data = [line.split('\\t') for line in f.read().splitlines()]\n",
    "        \n",
    "    from random import randint\n",
    "    random_data = [data[randint(1, len(data))]  for no in range(int(len(data)/10)) ]\n",
    "    return random_data\n",
    "\n",
    "# 한글 Token에 품사정보를 덧붙이기\n",
    "def tokenize(doc):\n",
    "    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 24.6 s, sys: 335 ms, total: 24.9 s\n",
      "Wall time: 17.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from collections import namedtuple\n",
    "train_data        = read_data('../data/ratings_train.txt')\n",
    "train_docs        = [(tokenize(row[1]), row[2]) for row in train_data[1:]]\n",
    "TaggedDocument    = namedtuple('TaggedDocument', 'words tags')\n",
    "tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TaggedDocument(words=['제이슨/Noun', '스타덤/Noun', '의/Josa', '그/Noun', '흔하다/Adjective', '액션/Noun', '하나/Noun', '화끈/Noun', '하다/Verb', '보이다/Verb', '못/VerbPrefix', '하다/Verb', '.../Punctuation', '조연/Noun', '들/Suffix', '스토리/Noun', '도/Josa', '엉/Exclamation', '성하/Noun', '게/Josa', '마무리/Noun', '되다/Verb', ',/Punctuation', '실망/Noun', '가득하다/Adjective', '영화/Noun'], tags=['0'])\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "pprint(tagged_train_docs[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br></br>\n",
    "## **2 doc2vec 파라미터 설정 및 학습**\n",
    "**[세부적인 파라미터 설정](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doc2vec Model Saved\n",
      "CPU times: user 55.9 s, sys: 3.72 s, total: 59.6 s\n",
      "Wall time: 30.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from gensim.models import doc2vec\n",
    "# doc_vectorizer = doc2vec.Doc2Vec(\n",
    "#     dm          = 0,     # PV-DBOW / default 1\n",
    "#     dbow_words  = 1,     # w2v simultaneous with DBOW d2v / default 0\n",
    "#     window      = 8,     # distance between the predicted word and context words\n",
    "#     vector_size = 300,   # vector size\n",
    "#     alpha       = 0.025, # learning-rate\n",
    "#     seed        = 1234,\n",
    "#     min_count   = 20,    # ignore with freq lower\n",
    "#     min_alpha   = 0.025, # min learning-rate\n",
    "#     workers     = 4,     # multi cpu\n",
    "#     hs          = 1,     # hierarchical softmax / default 0\n",
    "#     negative    = 10,    # negative sampling / default 5\n",
    "# )\n",
    "\n",
    "# Doc2Vec 모델을 정의\n",
    "doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n",
    "\n",
    "# 정의된 모델에 Token 자료를 연결\n",
    "doc_vectorizer.build_vocab(tagged_train_docs)\n",
    "\n",
    "for epoch in range(10):\n",
    "    doc_vectorizer.train(tagged_train_docs, \n",
    "                         total_examples = doc_vectorizer.corpus_count, \n",
    "                         epochs = doc_vectorizer.epochs)\n",
    "    doc_vectorizer.alpha -= 0.002\n",
    "    doc_vectorizer.min_alpha = doc_vectorizer.alpha \n",
    "\n",
    "# 학습이 완료된 모델의 데이터를 저장한다\n",
    "doc_vectorizer.save('../data/doc2vec.model')\n",
    "print(\"doc2vec Model Saved\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<br></br>\n",
    "## **3 doc2Vec 모델활용**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n"
     ]
    }
   ],
   "source": [
    "%reset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Interactive namespace is empty.\n"
     ]
    }
   ],
   "source": [
    "%who"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import doc2vec\n",
    "from pprint import pprint\n",
    "doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('코미디/Noun', 0.4376811385154724),\n",
      " ('스릴러/Noun', 0.43532365560531616),\n",
      " ('스럽지도/Josa', 0.39870721101760864),\n",
      " ('박진/Noun', 0.39721059799194336),\n",
      " ('장르/Noun', 0.389506995677948),\n",
      " ('액션영화/Noun', 0.3805934488773346),\n",
      " ('블랙/Noun', 0.3729945719242096),\n",
      " ('종교/Noun', 0.3727499842643738),\n",
      " ('롭고/Josa', 0.3712288737297058),\n",
      " ('풍자/Noun', 0.3630199432373047)]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    }
   ],
   "source": [
    "pprint(doc_vectorizer.wv.most_similar('공포/Noun'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "-0.012412064"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('스럽지도/Josa', 0.384132444858551),\n",
      " ('장르/Noun', 0.3558204174041748),\n",
      " ('코믹/Noun', 0.3339052200317383),\n",
      " ('스릴러/Noun', 0.3245369791984558),\n",
      " ('고어/Noun', 0.3172786235809326),\n",
      " ('명분/Noun', 0.3150975704193115),\n",
      " ('히어로/Noun', 0.31317320466041565),\n",
      " ('로맨스/Noun', 0.3098544478416443),\n",
      " ('복선/Noun', 0.29699862003326416),\n",
      " ('신파/Noun', 0.2888486981391907)]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
      "  if np.issubdtype(vec.dtype, np.int):\n"
     ]
    }
   ],
   "source": [
    "pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], \n",
    "                                      negative=['남자/Noun']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.99763617e-03,  1.71703286e-03, -2.20827432e-03,  3.30096926e-03,\n",
       "        3.01561877e-03, -1.26834167e-03,  1.21280085e-02, -1.89038850e-02,\n",
       "        3.29405302e-05,  2.55695544e-03], dtype=float32)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.15552977"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.03374508, -0.00827203,  0.0110028 , -0.00999906, -0.01591366,\n",
       "        0.00750665, -0.00079125,  0.00492844, -0.00993689, -0.01647354],\n",
       "      dtype=float32)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.22114044"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}