{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "# **gensim | doc2vec**\n", "

\n",
    "## **1 네이버 리뷰 문장의 활용**\n",
    "네이버 영화리뷰 단어모델 만들기"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "try:\n",
    "    # Newer KoNLPy releases deprecate Twitter in favour of Okt; prefer Okt when present.\n",
    "    from konlpy.tag import Okt as _Tagger\n",
    "except ImportError:\n",
    "    from konlpy.tag import Twitter as _Tagger\n",
    "\n",
    "twitter = _Tagger()\n",
    "\n",
    "\n",
    "# Load the Naver movie-review data (keep only one tenth of it)\n",
    "def read_data(filename, ratio=0.1, seed=None):\n",
    "    \"\"\"Read a tab-separated ratings file and return a random subset of its rows.\n",
    "\n",
    "    The first line of the file is treated as a header and is never returned.\n",
    "\n",
    "    Args:\n",
    "        filename: Path to a UTF-8 encoded, tab-separated text file.\n",
    "        ratio: Fraction of the data rows to keep, between 0 and 1 (default 0.1).\n",
    "        seed: Optional seed for the sampler; pass a value to get the same subset every run.\n",
    "\n",
    "    Returns:\n",
    "        A list of rows, each a list of column strings, drawn without replacement.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ratio is outside the range [0, 1].\n",
    "    \"\"\"\n",
    "    if not 0 <= ratio <= 1:\n",
    "        raise ValueError('ratio must be between 0 and 1')\n",
    "\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        rows = [line.split('\\t') for line in f.read().splitlines()]\n",
    "\n",
    "    rows = rows[1:]  # skip the header line\n",
    "    sample_size = int(len(rows) * ratio)\n",
    "    # random.sample never indexes past the end and never returns the same row twice,\n",
    "    # unlike indexing with randint(1, len(data)), whose upper bound is inclusive.\n",
    "    return random.Random(seed).sample(rows, sample_size)\n",
    "\n",
    "\n",
    "# Attach part-of-speech information to each Korean token\n",
    "def tokenize(doc):\n",
    "    \"\"\"Return the tokens of doc as 'token/POS' strings, using the tagger with norm=True and stem=True.\"\"\"\n",
    "    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "from collections import namedtuple\n",
    "\n",
    "# A fixed seed makes the sampled subset (and everything built from it) repeatable.\n",
    "train_data = read_data('./data/ratings_train.txt', seed=1234)\n",
    "# read_data already drops the header, so every returned row is used here.\n",
    "# row[1] is tokenized as the document text and row[2] becomes the document tag.\n",
    "train_docs = [(tokenize(row[1]), row[2]) for row in train_data]\n",
    "TaggedDocument = namedtuple('TaggedDocument', 'words tags')\n",
    "tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint\n",
    "pprint(tagged_train_docs[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "

\n",
    "## **2 doc2vec 파라미터 설정 및 학습**\n",
    "**[세부적인 파라미터 설정](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "from gensim.models import doc2vec\n",
    "\n",
    "# Reference only (not executed): other Doc2Vec options that can be tuned.\n",
    "# doc_vectorizer = doc2vec.Doc2Vec(\n",
    "#     dm = 0,            # PV-DBOW / default 1\n",
    "#     dbow_words = 1,    # w2v simultaneous with DBOW d2v / default 0\n",
    "#     window = 8,        # distance between the predicted word and context words\n",
    "#     vector_size = 300, # vector size\n",
    "#     alpha = 0.025,     # learning-rate\n",
    "#     seed = 1234,\n",
    "#     min_count = 20,    # ignore with freq lower\n",
    "#     min_alpha = 0.025, # min learning-rate\n",
    "#     workers = 4,       # multi cpu\n",
    "#     hs = 1,            # hierarchical softmax / default 0\n",
    "#     negative = 10,     # negative sampling / default 5\n",
    "# )\n",
    "\n",
    "START_ALPHA = 0.025  # initial learning rate\n",
    "END_ALPHA = 0.007    # final learning rate: 0.025 - 9 * 0.002, the rate of the last manual round\n",
    "TRAIN_ROUNDS = 10    # the earlier version called train() this many times\n",
    "\n",
    "# Define the Doc2Vec model\n",
    "doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=START_ALPHA,\n",
    "                                 min_alpha=END_ALPHA, seed=1234)\n",
    "\n",
    "# Build the vocabulary from the tagged token data\n",
    "doc_vectorizer.build_vocab(tagged_train_docs)\n",
    "\n",
    "# Train with ONE call and let gensim decay the learning rate from alpha to min_alpha.\n",
    "# Calling train() in a loop while editing alpha/min_alpha by hand is discouraged by the\n",
    "# gensim documentation, and it leaves the shrunken rates stored on the saved model.\n",
    "# The total number of passes matches the former loop: TRAIN_ROUNDS x model.epochs.\n",
    "doc_vectorizer.train(tagged_train_docs,\n",
    "                     total_examples=doc_vectorizer.corpus_count,\n",
    "                     epochs=TRAIN_ROUNDS * doc_vectorizer.epochs)\n",
    "\n",
    "# Save the trained model\n",
    "doc_vectorizer.save('./data/doc2vec.model')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "

\n",
    "## **3 doc2Vec 모델활용**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the namespace so the cells below rely only on the saved model file.\n",
    "# -f skips the interactive y/n confirmation, which would otherwise stall Run All.\n",
    "%reset -f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%who"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-import after the reset and load the model saved by the training section.\n",
    "from gensim.models import doc2vec\n",
    "from pprint import pprint\n",
    "doc_vectorizer = doc2vec.Doc2Vec.load('./data/doc2vec.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(doc_vectorizer.wv.most_similar('공포/Noun'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],\n",
    "                                      negative=['남자/Noun']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Infer once and reuse the result, so the slice here and the sum in the next cell\n",
    "# describe the same vector.\n",
    "pixar_vector = doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])\n",
    "pixar_vector[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pixar_vector.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same pattern for the second token list: one inference, two views of it.\n",
    "horror_vector = doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])\n",
    "horror_vector[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "horror_vector.sum()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
"nbformat_minor": 2 }