{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# **고급 Feature 엔지니어링**\n", "## **1 Gensim 을 활용한 NLP 알고리즘**\n", "Word2Vec\n", "1. ! pip install gensim" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('chryses', 0.6371129751205444),\n", " ('priest', 0.6282371282577515),\n", " ('nymph', 0.6165897250175476),\n", " ('thanks', 0.6120550632476807),\n", " ('dishonored', 0.6062030792236328),\n", " ('narrate', 0.605045735836029),\n", " ('angered', 0.6038438677787781),\n", " ('chieftains', 0.6015218496322632),\n", " ('appease', 0.6003137826919556),\n", " ('akhilleus', 0.6002722978591919)]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from gensim.models import Word2Vec\n", "w2v_model = Word2Vec.load('../backup/model.bin')\n", "w2v_model.wv.most_similar(positive=['woman','king'], negative=['man'])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.3998656" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_model.wv.similarity('woman','man')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "11098" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sorted(w2v_model.wv.vocab.keys(), reverse=False)[:14]\n", "len(w2v_model.wv.vocab.keys())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **2 Gensim 을 활용한 유사도 분석실습**\n", "Word2Vec\n", "1. ! pip install gensim\n", "1. 
**\"Ice\"** 와 **\"Fire\"** 의 문학내 유사도 측정" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('principals', 0.9933313131332397),\n", " ('threatening', 0.9859204292297363),\n", " ('distorting', 0.9620187878608704),\n", " ('freeport', 0.9174789190292358),\n", " ('stood', 0.8492887616157532),\n", " ('extend', 0.8167773485183716),\n", " ('douglas', 0.767949104309082),\n", " ('1858', 0.7635020017623901),\n", " ('conspiracy', 0.762062668800354),\n", " ('values', 0.7606658339500427)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "w2v_model.wv.most_similar('stark')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

\n", "# **The Simpson 대본의 학습 및 분석**\n", "심슨 에피소드의 대본을 활용한 문장내 단어의 의미 분석 Tutorial 입니다.\n", "1. **[Kaggle Tutorial](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial)**\n", "1. **[Simpson Script](https://www.kaggle.com/pierremegret/dialogue-lines-of-the-simpsons)**" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import logging\n", "import pandas as pd\n", "from time import time\n", "\n", "# Setting up the loggings to monitor gensim\n", "logging.basicConfig(format=\"%(levelname)s - %(asctime)s: %(message)s\", \n", " datefmt= '%H:%M:%S', level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **1 PreProcessing**\n", "데이터 전처리 작업의 진행" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(158314, 2)\n", "raw_character_text 17814\n", "spoken_words 26459\n", "dtype: int64\n", "(131853, 2)\n", "raw_character_text 0\n", "spoken_words 0\n", "dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
raw_character_textspoken_words
0Miss HooverNo, actually, it was a little of both. Sometim...
1Lisa SimpsonWhere's Mr. Bergstrom?
2Miss HooverI don't know. Although I'd sure like to talk t...
\n", "
" ], "text/plain": [ " raw_character_text spoken_words\n", "0 Miss Hoover No, actually, it was a little of both. Sometim...\n", "1 Lisa Simpson Where's Mr. Bergstrom?\n", "2 Miss Hoover I don't know. Although I'd sure like to talk t..." ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../backup/simpsons_dataset.csv')\n", "print(df.shape)\n", "\n", "# NaN, Null 데이터를 제거 합니다\n", "print(df.isnull().sum())\n", "df = df.dropna().reset_index(drop=True)\n", "print(df.shape)\n", "print(df.isnull().sum())\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "131853it [01:07, 1961.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(85960, 1)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
raw_character_textspoken_words
0Miss HooverNo, actually, it was a little of both. Sometim...
1Lisa SimpsonWhere's Mr. Bergstrom?
2Miss HooverI don't know. Although I'd sure like to talk t...
\n", "
" ], "text/plain": [ " raw_character_text spoken_words\n", "0 Miss Hoover No, actually, it was a little of both. Sometim...\n", "1 Lisa Simpson Where's Mr. Bergstrom?\n", "2 Miss Hoover I don't know. Although I'd sure like to talk t..." ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re, spacy\n", "def cleaning(doc):\n", " txt = [token.lemma_ for token in doc if not token.is_stop]\n", " if len(txt) > 2: # 2글자 이상 단어만 선별\n", " return ' '.join(txt)\n", " \n", "brief_cleaning = (re.sub(\"[^A-Za-z']+\", ' ', str(row)).lower() \n", " for row in df['spoken_words'])\n", "\n", "# Spacy의 PipeLine 을 활용하여 표제어로 변경 및 Stopword 를 제거 합니다.\n", "from tqdm import tqdm\n", "nlp = spacy.load('en', disable=['ner', 'parser']) # stopword 필터링 파이프라인\n", "txt = [cleaning(doc) for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1))]\n", "\n", "# 데이터 없는 행제거 및 전처리 완료 된 테이블을 출력 합니다\n", "df_clean = pd.DataFrame({'clean': txt})\n", "df_clean = df_clean.dropna().drop_duplicates()\n", "print(df_clean.shape)\n", "df.head(3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(85960, 1)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clean
0actually little disease magazine news show nat...
2know sure like talk touch lesson plan teach
3life worth live
\n", "
" ], "text/plain": [ " clean\n", "0 actually little disease magazine news show nat...\n", "2 know sure like talk touch lesson plan teach\n", "3 life worth live" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_clean = pd.DataFrame({'clean': txt})\n", "df_clean = df_clean.dropna().drop_duplicates()\n", "print(df_clean.shape)\n", "df_clean.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **2 N-Gram Modeling**\n", "**\"mr_burns\", \"bart_simpson\"** 과 같은 **bi-gram** 모델을 만듭니다." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['actually',\n", " 'little',\n", " 'disease',\n", " 'magazine',\n", " 'news',\n", " 'show',\n", " 'natural',\n", " 'think'],\n", " ['know', 'sure', 'like', 'talk', 'touch', 'lesson', 'plan', 'teach'],\n", " ['life', 'worth', 'live']]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sent = [row.split() for row in df_clean['clean']]\n", "sent[:3]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO - 15:35:52: collecting all words and their counts\n", "INFO - 15:35:52: PROGRESS: at sentence #0, processed 0 words and 0 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #10000, processed 63561 words and 52716 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #20000, processed 130949 words and 99637 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #30000, processed 192972 words and 138212 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #40000, processed 249845 words and 172230 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #50000, processed 311277 words and 208051 word types\n", "INFO - 15:35:52: PROGRESS: at sentence #60000, processed 373597 words and 243068 word types\n", "INFO - 15:35:53: PROGRESS: at sentence #70000, processed 436446 words and 278001 word 
types\n", "INFO - 15:35:53: PROGRESS: at sentence #80000, processed 497916 words and 311099 word types\n", "INFO - 15:35:53: collected 329869 word types from a corpus of 537147 words (unigram + bigrams) and 85960 sentences\n", "INFO - 15:35:53: using 329869 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>\n", "INFO - 15:35:53: source_vocab length 329869\n", "INFO - 15:35:56: Phraser built with 126 phrasegrams\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Phrases() 모듈에는 list로 구성된 텍스트를 바로 입력 가능 합니다\n", "# Phraser() 를 사용하는 이유는 Phrases() 의 메모리 차지를 줄입니다\n", "\n", "from gensim.models.phrases import Phrases, Phraser\n", "phrases = Phrases(sent, min_count=30, progress_per=10000)\n", "bigram = Phraser(phrases)\n", "sentences = bigram[sent]\n", "sentences" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **3 빈도 수의 계산 및 모델링**\n", "Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and addition of bigrams." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29643\n" ] }, { "data": { "text/plain": [ "['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "\n", "word_freq = defaultdict(int)\n", "for sent in sentences:\n", " for i in sent:\n", " word_freq[i] += 1\n", "print(len(word_freq))\n", "sorted(word_freq, key=word_freq.get, reverse=True)[:10]\n", "# \" // \".join(word_freq.keys())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **4 W2V 모델의 학습**\n", "Gensim 에 내장된 **[word2vec](https://radimrehurek.com/gensim/models/word2vec.html)** 로 학습 합니다. 3 단계로 구분한 뒤 단계별 모니터링 하면서 작업을 진행 합니다.\n", "1. 
Word2Vec():\n", "\n", " 이 첫 번째 단계에서는 모델의 매개 변수를 하나씩 설정합니다.\n", " 매개 변수 문장을 제공하지 않으므로 의도적으로 모델을 초기화하지 않은 상태로 둡니다.\n", "\n", "1. .build_vocab():\n", "\n", " 여기에서는 일련의 문장으로 어휘를 구성하여 모델을 초기화했습니다.\n", " 로깅을 통해 단어 진행에 대한 min_count 및 sample의 효과와 진행 상황을 더 중요하게 따를 수 있습니다. \n", " 특히 샘플은 모델의 성능에 큰 영향을 미치는 것으로 나타났습니다.\n", " \n", "1. .train():\n", "\n", " Finally, trains the model.\n", " The loggings here are mainly useful for monitoring, making sure that no threads are executed instantaneously.\n", " \n", "개별 Parameter 는 다음과 같습니다.\n", "1. **min_count = int :** Ignores all words with total absolute frequency lower than this - (2, 100)\n", "1. **window = int :** The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)\n", "1. **size = int :** Dimensionality of the feature vectors. - (50, 300)\n", "1. **sample = float :** The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)\n", "1. **alpha = float :** The initial learning rate - (0.01, 0.05)\n", "1. **min_alpha = float :** Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00\n", "1. **negative = int :** If > 0, negative sampling will be used, the int for negative specifies how many \"noise words\" should be drawn. If set to 0, no negative sampling is used. - (5, 20)\n", "1. 
**workers = int :** Use these many worker threads to train the model (=faster training with multicore machines)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import multiprocessing\n", "# 컴퓨터에서 연산 가능한 core 숫자를 호출 합니다\n", "cores = multiprocessing.cpu_count() \n", "cores" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO - 15:35:58: collecting all words and their counts\n", "INFO - 15:35:58: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", "INFO - 15:35:58: PROGRESS: at sentence #10000, processed 61706 words, keeping 9491 word types\n", "INFO - 15:35:59: PROGRESS: at sentence #20000, processed 127342 words, keeping 14373 word types\n", "INFO - 15:35:59: PROGRESS: at sentence #30000, processed 187807 words, keeping 17431 word types\n", "INFO - 15:35:59: PROGRESS: at sentence #40000, processed 243316 words, keeping 20124 word types\n", "INFO - 15:35:59: PROGRESS: at sentence #50000, processed 303167 words, keeping 22558 word types\n", "INFO - 15:35:59: PROGRESS: at sentence #60000, processed 363915 words, keeping 24804 word types\n", "INFO - 15:36:00: PROGRESS: at sentence #70000, processed 425375 words, keeping 26960 word types\n", "INFO - 15:36:00: PROGRESS: at sentence #80000, processed 485514 words, keeping 28777 word types\n", "INFO - 15:36:00: collected 29643 word types from a corpus of 523645 raw words and 85960 sentences\n", "INFO - 15:36:00: Loading a fresh vocabulary\n", "INFO - 15:36:00: effective_min_count=20 retains 3315 unique words (11% of original 29643, drops 26328)\n", "INFO - 15:36:00: effective_min_count=20 leaves 437848 word corpus (83% of original 523645, drops 85797)\n", "INFO - 15:36:00: deleting the raw counts dictionary of 29643 items\n", "INFO - 15:36:00: sample=6e-05 
downsamples 1204 most-common words\n", "INFO - 15:36:00: downsampling leaves estimated 199419 word corpus (45.5% of prior 437848)\n", "INFO - 15:36:00: estimated required memory for 3315 words and 300 dimensions: 9613500 bytes\n", "INFO - 15:36:00: resetting layer weights\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Time to build vocab: 0.04 mins\n" ] } ], "source": [ "from gensim.models import Word2Vec\n", "w2v_model = Word2Vec(min_count=20, window=2, size=300, sample=6e-5, \n", " alpha=0.03, min_alpha=0.0007, negative=20, workers=cores-1)\n", "\n", "# Building the Vocabulary Table\n", "# Word2Vec requires us to build the vocabulary table \n", "# (simply digesting all the words and filtering out the unique words, and doing some basic counts on them)\n", "t = time()\n", "w2v_model.build_vocab(sentences, progress_per=10000)\n", "print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO - 15:36:00: training model with 3 workers on 3315 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2\n", "INFO - 15:36:01: EPOCH 1 - PROGRESS: at 39.39% examples, 78203 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:02: EPOCH 1 - PROGRESS: at 86.15% examples, 84963 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:02: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:02: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:02: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:02: EPOCH - 1 : training on 523645 raw words (199755 effective words) took 2.3s, 86796 effective words/s\n", "INFO - 15:36:03: EPOCH 2 - PROGRESS: at 45.82% examples, 90257 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:05: EPOCH 2 - PROGRESS: at 88.13% examples, 85172 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:05: worker 
thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:05: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:05: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:05: EPOCH - 2 : training on 523645 raw words (198855 effective words) took 2.3s, 86358 effective words/s\n", "INFO - 15:36:06: EPOCH 3 - PROGRESS: at 41.54% examples, 82756 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:07: EPOCH 3 - PROGRESS: at 84.22% examples, 82341 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:07: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:07: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:07: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:07: EPOCH - 3 : training on 523645 raw words (199333 effective words) took 2.4s, 83467 effective words/s\n", "INFO - 15:36:08: EPOCH 4 - PROGRESS: at 43.69% examples, 86246 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:09: EPOCH 4 - PROGRESS: at 90.04% examples, 87661 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:09: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:09: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:09: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:09: EPOCH - 4 : training on 523645 raw words (199442 effective words) took 2.3s, 88133 effective words/s\n", "INFO - 15:36:10: EPOCH 5 - PROGRESS: at 43.69% examples, 85630 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:11: EPOCH 5 - PROGRESS: at 90.04% examples, 88778 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:12: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:12: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:12: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:12: EPOCH - 5 : training on 523645 raw words (199772 effective words) took 
2.2s, 89311 effective words/s\n", "INFO - 15:36:13: EPOCH 6 - PROGRESS: at 43.69% examples, 86464 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:14: EPOCH 6 - PROGRESS: at 88.13% examples, 86912 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:14: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:14: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:14: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:14: EPOCH - 6 : training on 523645 raw words (199674 effective words) took 2.3s, 88173 effective words/s\n", "INFO - 15:36:15: EPOCH 7 - PROGRESS: at 43.69% examples, 87081 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:16: EPOCH 7 - PROGRESS: at 90.04% examples, 88962 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:16: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:16: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:16: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:16: EPOCH - 7 : training on 523645 raw words (199235 effective words) took 2.2s, 90257 effective words/s\n", "INFO - 15:36:17: EPOCH 8 - PROGRESS: at 43.69% examples, 84497 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:18: EPOCH 8 - PROGRESS: at 90.04% examples, 88015 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:18: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:18: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:18: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:18: EPOCH - 8 : training on 523645 raw words (199529 effective words) took 2.2s, 89366 effective words/s\n", "INFO - 15:36:19: EPOCH 9 - PROGRESS: at 45.82% examples, 87895 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:20: EPOCH 9 - PROGRESS: at 90.04% examples, 87242 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:21: worker thread finished; awaiting finish of 2 more 
threads\n", "INFO - 15:36:21: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:21: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:21: EPOCH - 9 : training on 523645 raw words (198889 effective words) took 2.2s, 88585 effective words/s\n", "INFO - 15:36:22: EPOCH 10 - PROGRESS: at 45.82% examples, 89262 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:23: EPOCH 10 - PROGRESS: at 91.99% examples, 90315 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:23: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:23: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:23: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:23: EPOCH - 10 : training on 523645 raw words (199127 effective words) took 2.2s, 90887 effective words/s\n", "INFO - 15:36:24: EPOCH 11 - PROGRESS: at 41.54% examples, 82666 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:25: EPOCH 11 - PROGRESS: at 90.04% examples, 88063 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:25: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:25: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:25: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:25: EPOCH - 11 : training on 523645 raw words (199146 effective words) took 2.2s, 88544 effective words/s\n", "INFO - 15:36:26: EPOCH 12 - PROGRESS: at 41.54% examples, 82819 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:27: EPOCH 12 - PROGRESS: at 86.15% examples, 85256 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:27: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:27: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:27: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:27: EPOCH - 12 : training on 523645 raw words (199233 effective words) took 2.3s, 85932 effective words/s\n", 
"INFO - 15:36:29: EPOCH 13 - PROGRESS: at 47.89% examples, 91893 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:30: EPOCH 13 - PROGRESS: at 93.83% examples, 90291 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:30: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:30: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:30: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:30: EPOCH - 13 : training on 523645 raw words (199783 effective words) took 2.2s, 90916 effective words/s\n", "INFO - 15:36:31: EPOCH 14 - PROGRESS: at 45.82% examples, 88879 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:32: EPOCH 14 - PROGRESS: at 93.83% examples, 91737 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:32: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:32: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:32: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:32: EPOCH - 14 : training on 523645 raw words (199586 effective words) took 2.2s, 91999 effective words/s\n", "INFO - 15:36:33: EPOCH 15 - PROGRESS: at 39.39% examples, 79193 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:34: EPOCH 15 - PROGRESS: at 88.13% examples, 85931 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:34: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:34: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:34: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:34: EPOCH - 15 : training on 523645 raw words (199069 effective words) took 2.3s, 87451 effective words/s\n", "INFO - 15:36:35: EPOCH 16 - PROGRESS: at 45.82% examples, 86120 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:36: EPOCH 16 - PROGRESS: at 90.04% examples, 86970 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:36: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 
15:36:36: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:36: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:36: EPOCH - 16 : training on 523645 raw words (199228 effective words) took 2.3s, 87862 effective words/s\n", "INFO - 15:36:37: EPOCH 17 - PROGRESS: at 43.69% examples, 86403 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:38: EPOCH 17 - PROGRESS: at 90.04% examples, 88756 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:39: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:39: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:39: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:39: EPOCH - 17 : training on 523645 raw words (199507 effective words) took 2.2s, 89617 effective words/s\n", "INFO - 15:36:40: EPOCH 18 - PROGRESS: at 41.54% examples, 80584 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:41: EPOCH 18 - PROGRESS: at 90.04% examples, 87118 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:41: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:41: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:41: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:41: EPOCH - 18 : training on 523645 raw words (199602 effective words) took 2.3s, 88206 effective words/s\n", "INFO - 15:36:42: EPOCH 19 - PROGRESS: at 45.82% examples, 89088 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:43: EPOCH 19 - PROGRESS: at 90.04% examples, 87161 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:43: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:43: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:43: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:43: EPOCH - 19 : training on 523645 raw words (199656 effective words) took 2.3s, 88199 effective words/s\n", "INFO - 15:36:44: 
EPOCH 20 - PROGRESS: at 45.82% examples, 88658 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:45: EPOCH 20 - PROGRESS: at 91.99% examples, 89779 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:45: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:45: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:45: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:45: EPOCH - 20 : training on 523645 raw words (199528 effective words) took 2.2s, 90283 effective words/s\n", "INFO - 15:36:46: EPOCH 21 - PROGRESS: at 41.54% examples, 82527 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:47: EPOCH 21 - PROGRESS: at 90.04% examples, 87982 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:48: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:48: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:48: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:48: EPOCH - 21 : training on 523645 raw words (199284 effective words) took 2.2s, 89021 effective words/s\n", "INFO - 15:36:49: EPOCH 22 - PROGRESS: at 41.54% examples, 82536 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:50: EPOCH 22 - PROGRESS: at 86.15% examples, 84963 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:50: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:50: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:50: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:50: EPOCH - 22 : training on 523645 raw words (198940 effective words) took 2.3s, 86840 effective words/s\n", "INFO - 15:36:51: EPOCH 23 - PROGRESS: at 45.82% examples, 88984 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:52: EPOCH 23 - PROGRESS: at 90.04% examples, 87060 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:52: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:52: worker 
thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:52: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:52: EPOCH - 23 : training on 523645 raw words (199452 effective words) took 2.3s, 87866 effective words/s\n", "INFO - 15:36:53: EPOCH 24 - PROGRESS: at 43.69% examples, 84958 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:54: EPOCH 24 - PROGRESS: at 90.04% examples, 87942 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:54: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:54: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:54: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:55: EPOCH - 24 : training on 523645 raw words (199363 effective words) took 2.2s, 88733 effective words/s\n", "INFO - 15:36:56: EPOCH 25 - PROGRESS: at 41.54% examples, 82595 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:36:57: EPOCH 25 - PROGRESS: at 90.04% examples, 88227 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:57: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:57: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:57: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:57: EPOCH - 25 : training on 523645 raw words (199378 effective words) took 2.3s, 88375 effective words/s\n", "INFO - 15:36:58: EPOCH 26 - PROGRESS: at 41.54% examples, 79913 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:59: EPOCH 26 - PROGRESS: at 86.15% examples, 82995 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:36:59: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:36:59: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:36:59: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:36:59: EPOCH - 26 : training on 523645 raw words (199376 effective words) took 2.4s, 84789 effective words/s\n", "INFO - 15:37:00: EPOCH 27 - 
PROGRESS: at 41.54% examples, 81776 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:01: EPOCH 27 - PROGRESS: at 86.15% examples, 84585 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:01: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:37:01: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:37:01: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:37:01: EPOCH - 27 : training on 523645 raw words (199427 effective words) took 2.3s, 85989 effective words/s\n", "INFO - 15:37:02: EPOCH 28 - PROGRESS: at 45.82% examples, 90238 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:04: EPOCH 28 - PROGRESS: at 91.99% examples, 89660 words/s, in_qsize 1, out_qsize 0\n", "INFO - 15:37:04: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:37:04: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:37:04: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:37:04: EPOCH - 28 : training on 523645 raw words (199613 effective words) took 2.2s, 90478 effective words/s\n", "INFO - 15:37:05: EPOCH 29 - PROGRESS: at 41.54% examples, 81478 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:06: EPOCH 29 - PROGRESS: at 90.04% examples, 87387 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:06: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:37:06: worker thread finished; awaiting finish of 1 more threads\n", "INFO - 15:37:06: worker thread finished; awaiting finish of 0 more threads\n", "INFO - 15:37:06: EPOCH - 29 : training on 523645 raw words (199377 effective words) took 2.3s, 88449 effective words/s\n", "INFO - 15:37:07: EPOCH 30 - PROGRESS: at 43.69% examples, 84277 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:08: EPOCH 30 - PROGRESS: at 88.13% examples, 84638 words/s, in_qsize 0, out_qsize 0\n", "INFO - 15:37:08: worker thread finished; awaiting finish of 2 more threads\n", "INFO - 15:37:08: worker thread 
# %% Train the Word2Vec model
# Training of the model (Parameters of the training)
# total_examples = int : Count of sentences
# epochs = int : Number of iterations (epochs) over the corpus - [10, 20, 30]
# NOTE(review): `time`, `sentences` and `w2v_model` come from earlier cells that
# are not visible here -- presumably `from time import time`; confirm.
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
# Elapsed wall-clock time, converted from seconds to minutes and rounded to 2 decimals.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# %% Precompute normalised vectors
# If no further training is planned, call init_sims() to make the model much
# more memory-efficient.
# NOTE(review): per the gensim 3.x docs, replace=True overwrites the raw vectors
# with their L2-normalised versions, so the model should not be trained further
# afterwards -- confirm against the installed gensim version.
w2v_model.init_sims(replace=True)
# %% Words most similar to a main character
# Retrieve the words most strongly associated with the main character Homer.
w2v_model.wv.most_similar(positive=["homer"])
# w2v_model.wv.most_similar(positive=["marge"])
# w2v_model.wv.most_similar(positive=["bart"])

# %% Same query for a bigram token
# Check the bigram token "homer_simpson".
w2v_model.wv.most_similar(positive=["homer_simpson"])

# %% Pairwise similarity
# Measure the similarity between two words.
# w2v_model.wv.similarity("moe_'s", 'tavern')
w2v_model.wv.similarity('maggie', 'baby')

# %%
w2v_model.wv.similarity('bart', 'nelson')

# %% Odd one out
# Pick out the word that is least related to the others in the list.
# w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

# %% Analogy-style queries
# Top 3 words closest to ("woman" + "homer" - "marge").
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

# %%
# Top 3 words closest to ("woman" + "bart" - "man").
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)
def tsnescatterplot(model, word, list_names, pca_components=15, perplexity=15):
    """Plot a 2-D t-SNE projection of a query word and the words around it.

    Three groups of word vectors are read from ``model.wv`` and drawn on a new
    9x9 inch figure:

    * the query ``word`` (red),
    * the words returned by ``model.wv.most_similar([word])`` (blue),
    * the extra words given in ``list_names`` (green).

    The vectors are first compressed with PCA and then embedded into two
    dimensions with t-SNE (``random_state=0``).

    Relies on the module-level names ``np``, ``pd``, ``plt``, ``sns``, ``PCA``
    and ``TSNE`` being imported by the notebook.

    Parameters
    ----------
    model : object exposing a ``wv`` keyed-vectors attribute
        Trained embedding model (a gensim ``Word2Vec`` in this notebook).
    word : str
        Query word; it must be known to ``model.wv``.
    list_names : iterable of str
        Additional words to plot alongside the query word and its neighbours.
    pca_components : int, optional
        Upper bound on the number of PCA components (default 15). It is
        lowered automatically when fewer points or dimensions are available.
    perplexity : int, optional
        Upper bound on the t-SNE perplexity (default 15). It is lowered
        automatically so that it stays below the number of plotted points.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    list_names = list(list_names)
    close_words = [neighbour for neighbour, _score in model.wv.most_similar([word])]

    # Labels and colours are kept in the same order as the rows of `arrays`.
    word_labels = [word] + close_words + list_names
    color_list = (['red']
                  + ['blue'] * len(close_words)
                  + ['green'] * len(list_names))

    # One lookup for every word: no hard-coded embedding size and no repeated
    # np.append copies, so models of any vector dimensionality work.
    arrays = np.asarray(model.wv[word_labels], dtype='f')
    n_points, n_dims = arrays.shape

    # PCA cannot produce more components than min(n_points, n_dims).
    n_components = min(pca_components, n_points, n_dims)
    reduc = PCA(n_components=n_components).fit_transform(arrays)

    # Global NumPy print setting kept from the original implementation
    # (it affects how arrays are printed in later cells as well).
    np.set_printoptions(suppress=True)

    # t-SNE needs perplexity < number of samples; clamp for short word lists.
    tsne_perplexity = min(perplexity, max(n_points - 1, 1))
    coords = TSNE(n_components=2, random_state=0,
                  perplexity=tsne_perplexity).fit_transform(reduc)

    df = pd.DataFrame({'x': coords[:, 0],
                       'y': coords[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Scatter only (no regression line), one colour per word group.
    sns.regplot(data=df, x="x", y="y", fit_reg=False, marker="o",
                scatter_kws={'s': 40, 'facecolors': df['color']}, ax=ax)

    # Label every point with its (title-cased) word in the group colour.
    for row in df.itertuples(index=False):
        ax.text(row.x, row.y,
                ' ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color, weight='normal')

    # Leave a 50-unit margin so that labels near the border stay visible.
    ax.set_xlim(coords[:, 0].min() - 50, coords[:, 0].max() + 50)
    ax.set_ylim(coords[:, 1].min() - 50, coords[:, 1].max() + 50)
    ax.set_title('t-SNE visualization for {}'.format(word.title()))
# %% Visualise the neighbourhood of "homer"
# Plots the query word 'homer', the words the model reports as most similar to
# it, and the eight hand-picked words listed below.
%matplotlib inline
tsnescatterplot(w2v_model, 'homer', 
                ['dog', 'bird', 'ah', 'maude', 'bob', 'mel', 'apu', 'duff'])