{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"# **gensim | doc2vec**\n",
"
\n",
"## **1 네이버 리뷰 문장의 활용**\n",
"네이버 영화리뷰 단어모델 만들기"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from konlpy.tag import Twitter\n",
"twitter = Twitter()\n",
"\n",
"# 네이버 영화리뷰 데이터 불러오기 (1/10만 추출한다)\n",
"def read_data(filename):\n",
" with open(filename, 'r', encoding='utf-8') as f:\n",
" data = [line.split('\\t') for line in f.read().splitlines()]\n",
" \n",
" from random import randint\n",
" random_data = [data[randint(1, len(data))] for no in range(int(len(data)/10)) ]\n",
" return random_data\n",
"\n",
"# 한글 Token에 품사정보를 덧붙이기\n",
"def tokenize(doc):\n",
" return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]"
]
},
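{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the tokenizer (a minimal sketch; the sample sentence below is an arbitrary example): `tokenize()` should return 'morpheme/POS' strings, the same format used in the similarity queries later in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize an arbitrary sample sentence to inspect the 'morpheme/POS' output format\n",
"tokenize('이 영화 정말 재미있다')"
]
},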
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"from collections import namedtuple\n",
"train_data = read_data('./data/ratings_train.txt')\n",
"train_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]\n",
"TaggedDocument = namedtuple('TaggedDocument', 'words tags')\n",
"tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"pprint(tagged_train_docs[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **2 doc2vec 파라미터 설정 및 학습**\n",
"**[세부적인 파라미터 설정](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"from gensim.models import doc2vec\n",
"# doc_vectorizer = doc2vec.Doc2Vec(\n",
"# dm = 0, # PV-DBOW / default 1\n",
"# dbow_words = 1, # w2v simultaneous with DBOW d2v / default 0\n",
"# window = 8, # distance between the predicted word and context words\n",
"# vector_size = 300, # vector size\n",
"# alpha = 0.025, # learning-rate\n",
"# seed = 1234,\n",
"# min_count = 20, # ignore with freq lower\n",
"# min_alpha = 0.025, # min learning-rate\n",
"# workers = 4, # multi cpu\n",
"# hs = 1, # hierarchical softmax / default 0\n",
"# negative = 10, # negative sampling / default 5\n",
"# )\n",
"\n",
"# Doc2Vec 모델을 정의\n",
"doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n",
"\n",
"# 정의된 모델에 Token 자료를 연결\n",
"doc_vectorizer.build_vocab(tagged_train_docs)\n",
"\n",
"for epoch in range(10):\n",
" doc_vectorizer.train(tagged_train_docs, \n",
" total_examples = doc_vectorizer.corpus_count, \n",
" epochs = doc_vectorizer.epochs)\n",
" doc_vectorizer.alpha -= 0.002\n",
" doc_vectorizer.min_alpha = doc_vectorizer.alpha \n",
"\n",
"# 학습이 완료된 모델의 데이터를 저장한다\n",
"doc_vectorizer.save('./data/doc2vec.model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"
\n",
"## **3 doc2Vec 모델활용**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%reset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%who"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import doc2vec\n",
"from pprint import pprint\n",
"doc_vectorizer = doc2vec.Doc2Vec.load('./data/doc2vec.model')"
]
},
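{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check that the reloaded model is the one trained above (a minimal sketch): `vector_size` should be 300 and `corpus_count` should match the number of tagged training documents."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Basic attributes of the reloaded model\n",
"print('vector size :', doc_vectorizer.vector_size)\n",
"print('corpus count:', doc_vectorizer.corpus_count)"
]
},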
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(doc_vectorizer.wv.most_similar('공포/Noun'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], \n",
" negative=['남자/Noun']))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()"
]
},
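{
"cell_type": "markdown",
"metadata": {},
"source": [
"Inferred vectors can also be compared with each other directly. Below is a minimal sketch (the token lists are arbitrary examples, and numpy is assumed to be installed) computing the cosine similarity between two inferred document vectors."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Infer vectors for two arbitrary token lists and compare them with cosine similarity\n",
"v1 = doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])\n",
"v2 = doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])\n",
"cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))\n",
"print('cosine similarity:', cosine)"
]
},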
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}