{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "# **gensim | doc2vec**\n", "

\n", "## **1 네이버 리뷰 문장의 활용**\n", "네이버 영화리뷰 단어모델 만들기" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from konlpy.tag import Twitter\n", "twitter = Twitter()\n", "\n", "# 네이버 영화리뷰 데이터 불러오기 (1/10만 추출한다)\n", "def read_data(filename):\n", " with open(filename, 'r', encoding='utf-8') as f:\n", " data = [line.split('\\t') for line in f.read().splitlines()]\n", " \n", " from random import randint\n", " random_data = [data[randint(1, len(data))] for no in range(int(len(data)/10)) ]\n", " return random_data\n", "\n", "# 한글 Token에 품사정보를 덧붙이기\n", "def tokenize(doc):\n", " return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 24.6 s, sys: 335 ms, total: 24.9 s\n", "Wall time: 17.5 s\n" ] } ], "source": [ "%%time\n", "from collections import namedtuple\n", "train_data = read_data('../data/ratings_train.txt')\n", "train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]\n", "TaggedDocument = namedtuple('TaggedDocument', 'words tags')\n", "tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TaggedDocument(words=['제이슨/Noun', '스타덤/Noun', '의/Josa', '그/Noun', '흔하다/Adjective', '액션/Noun', '하나/Noun', '화끈/Noun', '하다/Verb', '보이다/Verb', '못/VerbPrefix', '하다/Verb', '.../Punctuation', '조연/Noun', '들/Suffix', '스토리/Noun', '도/Josa', '엉/Exclamation', '성하/Noun', '게/Josa', '마무리/Noun', '되다/Verb', ',/Punctuation', '실망/Noun', '가득하다/Adjective', '영화/Noun'], tags=['0'])\n" ] } ], "source": [ "from pprint import pprint\n", "pprint(tagged_train_docs[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "## **2 doc2vec 파라미터 설정 및 학습**\n", "**[세부적인 파라미터 설정](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "doc2vec Model Saved\n", "CPU times: user 55.9 s, sys: 3.72 s, total: 59.6 s\n", "Wall time: 30.2 s\n" ] } ], "source": [ "%%time\n", "from gensim.models import doc2vec\n", "# doc_vectorizer = doc2vec.Doc2Vec(\n", "# dm = 0, # PV-DBOW / default 1\n", "# dbow_words = 1, # w2v simultaneous with DBOW d2v / default 0\n", "# window = 8, # distance between the predicted word and context words\n", "# vector_size = 300, # vector size\n", "# alpha = 0.025, # learning-rate\n", "# seed = 1234,\n", "# min_count = 20, # ignore with freq lower\n", "# min_alpha = 0.025, # min learning-rate\n", "# workers = 4, # multi cpu\n", "# hs = 1, # hierarchical softmax / default 0\n", "# negative = 10, # negative sampling / default 5\n", "# )\n", "\n", "# Doc2Vec 모델을 정의\n", "doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n", "\n", "# 정의된 모델에 Token 자료를 연결\n", "doc_vectorizer.build_vocab(tagged_train_docs)\n", "\n", "for epoch in range(10):\n", " doc_vectorizer.train(tagged_train_docs, \n", " total_examples = doc_vectorizer.corpus_count, \n", " epochs = doc_vectorizer.epochs)\n", " doc_vectorizer.alpha -= 0.002\n", " doc_vectorizer.min_alpha = doc_vectorizer.alpha \n", "\n", "# 학습이 완료된 모델의 데이터를 저장한다\n", "doc_vectorizer.save('../data/doc2vec.model')\n", "print(\"doc2vec Model Saved\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "## **3 doc2Vec 모델활용**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Once deleted, variables cannot be recovered. Proceed (y/[n])? y\n" ] } ], "source": [ "%reset" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Interactive namespace is empty.\n" ] } ], "source": [ "%who" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from gensim.models import doc2vec\n", "from pprint import pprint\n", "doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('코미디/Noun', 0.4376811385154724),\n", " ('스릴러/Noun', 0.43532365560531616),\n", " ('스럽지도/Josa', 0.39870721101760864),\n", " ('박진/Noun', 0.39721059799194336),\n", " ('장르/Noun', 0.389506995677948),\n", " ('액션영화/Noun', 0.3805934488773346),\n", " ('블랙/Noun', 0.3729945719242096),\n", " ('종교/Noun', 0.3727499842643738),\n", " ('롭고/Josa', 0.3712288737297058),\n", " ('풍자/Noun', 0.3630199432373047)]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", " if np.issubdtype(vec.dtype, np.int):\n" ] } ], "source": [ "pprint(doc_vectorizer.wv.most_similar('공포/Noun'))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", " if np.issubdtype(vec.dtype, np.int):\n" ] }, { "data": { "text/plain": [ "-0.012412064" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('스럽지도/Josa', 0.384132444858551),\n", " ('장르/Noun', 0.3558204174041748),\n", " ('코믹/Noun', 0.3339052200317383),\n", " ('스릴러/Noun', 0.3245369791984558),\n", " ('고어/Noun', 0.3172786235809326),\n", " ('명분/Noun', 0.3150975704193115),\n", " ('히어로/Noun', 0.31317320466041565),\n", " ('로맨스/Noun', 0.3098544478416443),\n", " ('복선/Noun', 0.29699862003326416),\n", " ('신파/Noun', 0.2888486981391907)]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", " if np.issubdtype(vec.dtype, np.int):\n" ] } ], "source": [ "pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], \n", " negative=['남자/Noun']))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1.99763617e-03, 1.71703286e-03, -2.20827432e-03, 3.30096926e-03,\n", " 3.01561877e-03, -1.26834167e-03, 1.21280085e-02, -1.89038850e-02,\n", " 3.29405302e-05, 2.55695544e-03], dtype=float32)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.15552977" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-0.03374508, -0.00827203, 0.0110028 , -0.00999906, -0.01591366,\n", " 0.00750665, -0.00079125, 0.00492844, -0.00993689, -0.01647354],\n", " dtype=float32)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.22114044" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }